release code base

This commit is contained in:
Adnen Abdessaied 2024-02-20 16:31:21 +01:00
commit efbd43fed1
70 changed files with 4923 additions and 0 deletions

2
.gitattributes vendored Normal file

@ -0,0 +1,2 @@
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text

90
README.md Normal file

@ -0,0 +1,90 @@
<div align="center">
<h1> OLViT: Multi-Modal State Tracking via Attention-Based Embeddings for Video-Grounded Dialog </h1>
**[Adnen Abdessaied][4], &nbsp; [Manuel von Hochmeister][5], &nbsp; [Andreas Bulling][6]** <br> <br>
**COLING 2024**, Turin, Italy <img src="misc/italy.png" width="3%" align="center"> <br>
**[[Paper][7]]**
----------------
<img src="misc/teaser.png" width="40%" align="middle"><br><br>
</div>
# Table of Contents
* [Setup and Dependencies](#setup-and-dependencies)
* [Download Data](#download-data)
* [Training](#training)
* [Testing](#testing)
* [Results](#results)
* [Acknowledgements](#acknowledgements)
# Setup and Dependencies
We implemented our model using Python 3.7, PyTorch 1.11.0 (CUDA 11.3, CuDNN 8.3.2) and PyTorch Lightning. We recommend setting up a virtual environment using Anaconda. <br>
1. Install [git lfs][1] on your system
2. Clone our repository to download a checkpoint of our best model and our code
```shell
git lfs install
git clone this_repo.git
```
3. Create a conda environment and install dependencies
```shell
conda create -n olvit python=3.7
conda activate olvit
conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
pip install pytorch-lightning==1.6.3
pip install transformers==4.19.2
pip install torchtext==0.12.0
pip install wandb nltk pandas
```
# Download Data
1. [DVD][2] and [SIMMC 2.1][3] data are included in this repository and will be downloaded using git lfs
2. Set up the data by executing
```shell
chmod u+x setup_data.sh
./setup_data.sh
```
3. This will unpack all the necessary data into ```data/dvd/``` and ```data/simmc/```
# Training
We trained our model on 3 Nvidia Tesla V100-32GB GPUs. The default hyperparameters need to be adjusted if your setup differs from ours.
## DVD
1. Adjust the config file for DVD according to your hardware specifications in ```config/dvd.json```
2. Execute
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python train.py --cfg_path config/dvd.json
```
3. Checkpoints will be saved in ```checkpoints/dvd/```
## SIMMC 2.1
1. Adjust the config file for SIMMC 2.1 according to your hardware specifications in ```config/simmc.json```
2. Execute
```shell
CUDA_VISIBLE_DEVICES=0,1,2 python train.py --cfg_path config/simmc.json
```
3. Checkpoints will be saved in ```checkpoints/simmc/```
# Testing
1. Execute
```shell
CUDA_VISIBLE_DEVICES=0 python test.py --ckpt_path <PATH_TO_TRAINED_MODEL> --cfg_path <PATH_TO_CONFIG_OF_TRAINED_MODEL>
```
# Results
Training with the default config and a hardware setup similar to ours will result in the following performance:
## DVD
<img src="misc/results_dvd.png" width="100%" align="middle"><br><br>
## SIMMC 2.1
<img src="misc/results_simmc.png" width="50%" align="middle"><br><br>
# Acknowledgements
Our work relied on the codebases of [DVD][2] and [SIMMC][3]. Thanks to the authors for sharing their code.
[1]: https://git-lfs.com/
[2]: https://github.com/facebookresearch/DVDialogues/
[3]: https://github.com/facebookresearch/simmc2/
[4]: https://perceptualui.org/people/abdessaied/
[5]: https://www.linkedin.com/in/manuel-von-hochmeister-285416202/
[6]: https://www.perceptualui.org/people/bulling/
[7]: https://drive.google.com/file/d/1sDFfGpQ9E9NahT5gw8UjknWt3sNdxM7p/view?usp=sharing

0
checkpoints/dvd/.gitkeep Normal file

0
config/__init__.py Normal file

26
config/config.py Normal file

@ -0,0 +1,26 @@
import json
import os
def read_default_config():
dirpath = os.path.dirname(__file__)
path = os.path.join(dirpath, "default.json")
with open(path) as config_file:
config = json.load(config_file)
return config
def read_config(path):
with open(path) as config_file:
config = json.load(config_file)
return config
def update_nested_dicts(old_dict, update_dict):
for key in update_dict:
if key in old_dict:
old_dict[key].update(update_dict[key])
else:
old_dict[key] = update_dict[key]
return old_dict
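
Below is a minimal usage sketch of these helpers (the actual call site lives in `train.py`, which is not part of this diff; the import path assumes the repository root is on `PYTHONPATH`). Note that `update_nested_dicts` only merges one level deep, so each top-level section is updated field by field:

```python
# Hypothetical example: overlay an experiment config on top of the defaults.
from config.config import read_default_config, read_config, update_nested_dicts

config = read_default_config()                 # config/default.json
overrides = read_config("config/dvd.json")     # experiment-specific values
config = update_nested_dicts(config, overrides)

print(config["model"]["n_heads"])       # 6      (overridden by dvd.json)
print(config["model"]["feature_type"])  # "none" (kept from default.json)
```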

43
config/default.json Normal file

@ -0,0 +1,43 @@
{
"wandb": {
"entity": "TO_BE_DEFINED",
"name": "",
"group": "",
"tags": [],
"project": "olvit"
},
"model": {
"model_type": "base_model",
"feature_type": "none",
"freeze_roberta": true,
"v_emb_dim": 16,
"dim_feedforward": 400,
"n_heads": 9,
"fc_dim": 128,
"dropout_p": 0.1,
"sample_rate_video": 10,
"n_encoder_layers": 6,
"add_choices_as_context": false,
"use_pretrained_lm": false,
"projection_as_in_aloe": false,
"pretrained_lm_name": ""
},
"training": {
"lr": 1e-4,
"total_steps": 200000,
"warmup_steps": 4000,
"accumulate_grad_batches": 1,
"batch_size": 128,
"epochs": 40,
"seed": null
},
"datamodule": {
"fea_dir": "data/dvd/monet_feats/",
"data_dir": "data/dvd/dialogs/"
},
"checkpoint": {
"checkpoint_folder": "checkpoints/",
"checkpoint_file_name": "olvit"
}
}

49
config/dvd.json Normal file

@ -0,0 +1,49 @@
{
"wandb": {
"name": "olvit",
"group": "dvd",
"tags": [],
"project": "olvit"
},
"model": {
"model_type": "discriminative",
"n_heads": 6,
"v_emb_dim": 36,
"dim_feedforward": 200,
"dropout_p": 0.1,
"fc_dim": 512,
"sample_rate_video": 20,
"n_transf_layers": 4,
"use_pretrained_lm": true,
"projection_as_in_aloe": true,
"pretrained_lm_name": "distilroberta-base",
"dataset": "dvd"
},
"extended_model": {
"hist_len_for_state_gen": 7,
"number_of_relevant_emb": 2,
"num_layers_v_state": 2,
"num_layers_d_state": 2,
"combiner_option": "OptionA",
"state_tracker_type": "Transformer",
"use_v_state": true,
"use_d_state": true,
"n_heads_combiner_transformer": 8,
"n_heads_state_tracker": 6,
"dim_feedforward_v_transformer": 140,
"dim_feedforward_d_transformer": 60
},
"training": {
"lr": 1e-4,
"warmup_steps": 4000,
"total_steps": 200000,
"batch_size": 128,
"seed": 12345,
"epochs": 1000
},
"checkpoint": {
"checkpoint_folder": "checkpoints/dvd",
"checkpoint_file_name": "olvit"
}
}

61
config/simmc.json Normal file

@ -0,0 +1,61 @@
{
"wandb": {
"name": "olvit",
"group": "simmc2",
"tags": [],
"project": "olvit"
},
"model": {
"model_type": "generative",
"dataset": "simmc2",
"feature_type": "object_text_features",
"object_feature_generator_dim": 50,
"n_object_feature_generator_layers": 2,
"n_heads": 6,
"v_emb_dim": 516,
"emb_dim": 216,
"dim_feedforward": 200,
"dropout_p": 0.1,
"fc_dim": 512,
"sample_rate_video": 1,
"n_encoder_layers": 4,
"n_decoder_layers": 4,
"use_pretrained_lm": true,
"vocab_size": 50265,
"projection_as_in_aloe": false,
"pretrained_lm_name": "distilroberta-base"
},
"extended_model": {
"hist_len_for_state_gen": 3,
"number_of_relevant_emb": 2,
"num_layers_v_state": 2,
"num_layers_d_state": 2,
"combiner_option": "OptionA",
"state_tracker_type": "Transformer",
"use_v_state": true,
"use_d_state": true,
"n_heads_combiner_transformer": 8,
"n_heads_state_tracker": 6,
"dim_feedforward_v_transformer": 140,
"dim_feedforward_d_transformer": 60
},
"training": {
"lr": 1e-4,
"warmup_steps": 4000,
"total_steps": 200000,
"batch_size": 8,
"seed": 12345,
"epochs": 1000
},
"datamodule": {
"fea_dir": "data/simmc/visual_features_resnet50_simmc2.1.pt",
"data_dir": "data/simmc/dialogs"
},
"checkpoint": {
"checkpoint_folder": "checkpoints/simmc/",
"checkpoint_file_name": "olvit",
"output_path": "output/simmc/",
"checkpoint_path": "TO_BE_DETERMINED"
}
}

3
data/dvd/dialogs.tar.gz Normal file

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b1b58ee7af90b402eddbde8470dc0333b83ae293a90a93d26af3b8c39c2d9b0e
size 395953476


@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:933c88dbf854d11fca34c388b1b566096b4f9733abd2ded0a1d381b4b1c6a379
size 1582620496


@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c07f88af54843010899ed1149d16343b9aeb38dbd2cb4e1977bb4c2436d461ec
size 1582620496


@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65ed3852c6bbe9f3135558f1bfd3900e8c37ae9af7b8338b3535987408086ca6
size 12956266


@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f7aa24ce312e0cdbdb69021ce593aa985074e3ec88a737bc7af8060ff61d6a8
size 81394479

0
misc/.gitkeep Normal file

BIN
misc/italy.png Normal file

Binary file not shown. (Size: 3.9 KiB)

BIN
misc/results_dvd.png Normal file

Binary file not shown. (Size: 324 KiB)

BIN
misc/results_simmc.png Normal file

Binary file not shown. (Size: 34 KiB)

BIN
misc/teaser.pdf Normal file

Binary file not shown.

BIN
misc/teaser.png Normal file

Binary file not shown. (Size: 6.5 MiB)

0
output/.gitkeep Normal file

16
setup_data.sh Normal file

@ -0,0 +1,16 @@
cd data/dvd
tar -xvzf dialogs.tar.gz
cat monet_feats_part* > monet_feats.tar.gz
tar -xvzf monet_feats.tar.gz
rm dialogs.tar.gz
rm monet_feats.tar.gz
rm monet_feats_part00.tar.gz
rm monet_feats_part01.tar.gz
cd ../simmc
tar -xvzf dialogs.tar.gz
rm dialogs.tar.gz
cd ../..

0
src/__init__.py Normal file

25
src/combiner/option_a.py Normal file

@ -0,0 +1,25 @@
import pytorch_lightning as pl
import torch
class CombinerOptionA(pl.LightningModule):
def __init__(self, config=None, model_input_dim=None, use_v_state=False, use_d_state=False):
super().__init__()
self.use_v_state = use_v_state
self.use_d_state = use_d_state
def forward(self, vision_emb, language_emb, language_emb_mask, v_state, d_state, dummy_word=None):
if v_state is not None \
and d_state is not None \
and self.use_v_state \
and self.use_d_state:
output = torch.concat([v_state, d_state, vision_emb, language_emb], axis=1)
elif d_state is not None and self.use_d_state:
output = torch.concat([d_state, vision_emb, language_emb], axis=1)
elif v_state is not None and self.use_v_state:
output = torch.concat([v_state, vision_emb, language_emb], axis=1)
else:
output = torch.concat([vision_emb, language_emb], axis=1)
if dummy_word is not None:
output = torch.concat([dummy_word, output], axis=1)
return output
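
For orientation, a shape-only sketch of what Option A produces; the sizes are made up and the language mask is unused here, so it can be passed as `None`:

```python
import torch
from src.combiner.option_a import CombinerOptionA

# With both states enabled the output is simply
# [dummy_word | v_state | d_state | vision tokens | language tokens].
combiner = CombinerOptionA(use_v_state=True, use_d_state=True)
vision_emb = torch.randn(2, 30, 8)    # batch=2, 30 video tokens, emb dim 8
language_emb = torch.randn(2, 12, 8)  # 12 text tokens
v_state = torch.randn(2, 1, 8)
d_state = torch.randn(2, 1, 8)
dummy_word = torch.randn(2, 1, 8)     # CLS-like token

out = combiner(vision_emb, language_emb, None, v_state, d_state, dummy_word)
print(out.shape)  # torch.Size([2, 45, 8])
```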

38
src/combiner/option_b.py Normal file

@ -0,0 +1,38 @@
import pytorch_lightning as pl
import torch
class CombinerOptionB(pl.LightningModule):
def __init__(self, config=None, model_input_dim=None, use_v_state=False, use_d_state=False):
super().__init__()
self.use_v_state = use_v_state
self.use_d_state = use_d_state
def append_state_to_emb(self, tensor, state):
tiling_vector = [1, tensor.shape[1], 1]
state_tensor_for_concatenation = torch.tile(state, tiling_vector)
result = torch.concat([tensor, state_tensor_for_concatenation], axis=2)
return result
def forward(self, dummy_word, video_emb, language_emb, language_emb_mask, v_state, d_state):
# concatenate the video emb with the video state and the language emb with the dialogue state
# if a state is not used, concatenate the embedding with itself instead
if v_state is not None \
and d_state is not None \
and self.use_v_state \
and self.use_d_state:
video_emb = self.append_state_to_emb(video_emb, v_state)
language_emb = self.append_state_to_emb(language_emb, d_state)
elif d_state is not None and self.use_d_state:
video_emb = self.append_state_to_emb(video_emb, video_emb)
language_emb = self.append_state_to_emb(language_emb, d_state)
elif v_state is not None and self.use_v_state:
video_emb = self.append_state_to_emb(video_emb, v_state)
language_emb = self.append_state_to_emb(language_emb, language_emb)
else:
video_emb = self.append_state_to_emb(video_emb, video_emb)
language_emb = self.append_state_to_emb(language_emb, language_emb)
output = torch.concat([dummy_word, video_emb, language_emb], axis=1)
return output
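
Unlike Option A, Option B widens the feature dimension instead of lengthening the sequence: each state is tiled across the token axis and concatenated feature-wise, so the embedding size doubles. A shape-only sketch with made-up sizes (note the dummy word must already match the doubled dimension):

```python
import torch
from src.combiner.option_b import CombinerOptionB

combiner = CombinerOptionB(use_v_state=True, use_d_state=True)
video_emb = torch.randn(2, 30, 8)
language_emb = torch.randn(2, 12, 8)
v_state = torch.randn(2, 1, 8)      # tiled to (2, 30, 8) before concatenation
d_state = torch.randn(2, 1, 8)      # tiled to (2, 12, 8)
dummy_word = torch.randn(2, 1, 16)  # already sized for the doubled feature dim

out = combiner(dummy_word, video_emb, language_emb, None, v_state, d_state)
print(out.shape)  # torch.Size([2, 43, 16]) = 1 + 30 + 12 tokens, 8 + 8 features
```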

69
src/combiner/option_c.py Normal file

@ -0,0 +1,69 @@
import pytorch_lightning as pl
import torch
from torch import nn
class CombinerOptionC(pl.LightningModule):
def __init__(self, config, model_input_dim, use_v_state, use_d_state):
super().__init__()
self.config = config
self.use_v_state = use_v_state
self.use_d_state = use_d_state
self.encoder_layer_d = nn.TransformerEncoderLayer(
d_model=model_input_dim,
dim_feedforward=self.config['dim_feedforward_d_transformer'],
batch_first=True,
nhead=self.config['n_heads_combiner_transformer']
)
self.encoder_layer_v = nn.TransformerEncoderLayer(
d_model=model_input_dim,
dim_feedforward=self.config['dim_feedforward_v_transformer'],
batch_first=True,
nhead=self.config['n_heads_combiner_transformer']
)
def prepare_inputs_for_transformers(self, video_emb, language_emb, language_emb_mask, v_state, d_state):
# create masks for the language inputs (video sequences are all 301 frames long and don't need padding)
d_input_mask = ~language_emb_mask # pytorch expects True for masked tokens (opposite of the huggingface convention)
# if the dialogue state is used, prepend a column of False so the state token is attended to (not masked)
if d_state is not None and self.use_d_state:
zero_column = torch.zeros((d_input_mask.shape[0], 1), dtype=torch.bool, device=self.device)
d_input_mask = torch.concat([zero_column, d_input_mask],axis=1)
# prepare the input tensors for the different transformer layers depending on which state vectors should be used
if v_state is not None \
and d_state is not None \
and self.use_v_state \
and self.use_d_state:
v_input = torch.concat([v_state, video_emb], axis=1)
d_input = torch.concat([d_state, language_emb], axis=1)
elif d_state is not None and self.use_d_state:
v_input = video_emb
d_input = torch.concat([d_state, language_emb], axis=1)
elif v_state is not None and self.use_v_state:
v_input = torch.concat([v_state, video_emb], axis=1)
d_input = language_emb
else:
v_input = video_emb
d_input = language_emb
return v_input, d_input, d_input_mask
def forward(self, dummy_word, video_emb, language_emb, language_emb_mask, v_state, d_state):
# prepare the input tensors for the different transformer layers depending on which state vectors should be used
v_input, d_input, d_input_mask = self.prepare_inputs_for_transformers(video_emb, language_emb, language_emb_mask, v_state, d_state)
# apply the v transformer to the v input and the d transformer to the d input
v_emb = self.encoder_layer_v(v_input)
d_emb = self.encoder_layer_d(d_input, src_key_padding_mask=d_input_mask)
# combine the output of the first 2 transformers and add the dummy word (cls token)
# put the embedded video and dialog states at the beginning of the combined input
v_state_emb = v_emb[:, 0, :].unsqueeze(1)
d_state_emb = d_emb[:, 0, :].unsqueeze(1)
combined_input = torch.concat([dummy_word, v_state_emb, d_state_emb, v_emb[:, 1:, :], d_emb[:, 1:, :]], axis=1)
# create combined_input_mask based on the language_emb_mask
return combined_input


@ -0,0 +1,55 @@
import pytorch_lightning as pl
import src.utils.dvd_codebase.data.data_handler as dh
from src.utils.dvd_codebase.configs.configs import *
from transformers import AutoTokenizer
import os
class DVDData(pl.LightningDataModule):
def __init__(self, config):
super().__init__()
args.batch_size = config['training']['batch_size']
args.fea_dir = config['datamodule']['fea_dir']
args.data_dir = config['datamodule']['data_dir']
pretrained_lm_name = config['model']['pretrained_lm_name']
# load dialogues
self.train_dials, self.train_vids = dh.load_dials(args, "train")
self.val_dials, self.val_vids = dh.load_dials(args, "val")
self.test_dials, self.test_vids = dh.load_dials(args, "test")
# get vocabulary
self.vocab, self.answer_list = dh.get_vocabulary(self.train_dials, args)
# self.answer_list = ['0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'False', 'True', 'blue', 'brown', 'cone', 'cube', 'cyan', 'cylinder', 'flying', 'flying,rotating', 'flying,rotating,sliding', 'flying,sliding', 'gold', 'gray', 'green', 'large', 'medium', 'metal', 'no action', 'purple', 'red', 'rotating', 'rotating,sliding', 'rubber', 'sliding', 'small', 'sphere', 'spl', 'yellow']
train_vft = dh.load_video_features(args, self.train_vids)
val_vft = dh.load_video_features(args, self.val_vids)
test_vft = dh.load_video_features(args, self.test_vids)
# create tokenizer
if pretrained_lm_name != '':
tokenizer = AutoTokenizer.from_pretrained(pretrained_lm_name)
pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
self.vocab['<blank>'] = pad_token_id
os.environ["TOKENIZERS_PARALLELISM"] = "false"
else:
tokenizer = None
# load data
self.train_dials = dh.create_dials(self.train_dials, self.vocab, self.answer_list, train_vft, args, tokenizer=tokenizer)
self.val_dials = dh.create_dials(self.val_dials, self.vocab, self.answer_list, val_vft, args, tokenizer=tokenizer)
self.test_dials = dh.create_dials(self.test_dials, self.vocab, self.answer_list, test_vft, args, tokenizer=tokenizer)
def train_dataloader(self):
dl, _ = dh.create_dataset(self.train_dials, self.vocab, "train", args)
return dl
def val_dataloader(self):
dl, _ = dh.create_dataset(self.val_dials, self.vocab, "val", args)
return dl
def test_dataloader(self):
dl, _ = dh.create_dataset(self.test_dials, self.vocab, "test", args)
return dl
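
A hedged sketch of how this datamodule would typically be wired up; the real entry point is `train.py` (not shown in this diff), and the import paths for `DVDData` and `DiscriminativeModel` are guesses based on the file layout above:

```python
# Assumed wiring, not the repository's actual train.py.
import pytorch_lightning as pl
from config.config import read_default_config, read_config, update_nested_dicts
from src.data_modules.dvd_data import DVDData                     # path assumed
from src.models.discriminative_model import DiscriminativeModel   # path assumed

config = update_nested_dicts(read_default_config(), read_config("config/dvd.json"))
dm = DVDData(config)                 # loads dialogs, vocabulary and video features
model = DiscriminativeModel(config)

trainer = pl.Trainer(accelerator="gpu", devices=3,
                     max_epochs=config["training"]["epochs"])
trainer.fit(model, datamodule=dm)
```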


@ -0,0 +1,95 @@
import pytorch_lightning as pl
from src.utils.simmc2_dataset.dataloader_dvd_model import Simmc2Dataset, VisualFeatureLoader
from transformers import AutoTokenizer
import argparse
import os
from torch.utils.data import DataLoader
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--train_file", default='', help="Path to train file")
parser.add_argument("--dev_file", default='', help="Path to dev file")
parser.add_argument("--devtest_file", default='', help="Path to devtest file")
parser.add_argument(
"--visual_feature_path", default=None, help="Path to visual features"
)
parser.add_argument(
"--visual_feature_size",
type=int,
default=516,
help="Size of the visual features",
)
parser.add_argument(
"--max_turns", type=int, default=5, help="Number of turns in history"
)
parser.add_argument(
"--max_length", type=int, default=512, help="Maximum length in utterance"
)
parser.add_argument("--use_gpu", dest="use_gpu", action="store_true", default=True)
args = parser.parse_args()
return args
class Simmc2Data(pl.LightningDataModule):
def __init__(self, config):
super().__init__()
self.args = parse_arguments()
self.args.train_file = os.path.join(config['datamodule']['data_dir'], 'simmc2.1_ambiguous_candidates_dstc11_train.json')
self.args.dev_file = os.path.join(config['datamodule']['data_dir'], 'simmc2.1_ambiguous_candidates_dstc11_dev.json')
self.args.devtest_file = os.path.join(config['datamodule']['data_dir'], 'simmc2.1_ambiguous_candidates_dstc11_devtest.json')
self.args.teststd_file = os.path.join(config['datamodule']['data_dir'], 'simmc2.1_dials_dstc11_dev.json')
self.args.visual_feature_path = config['datamodule']['fea_dir']
pretrained_lm_name = config['model']['pretrained_lm_name']
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_lm_name)
self.feature_loader = VisualFeatureLoader(
feature_path=self.args.visual_feature_path,
feature_size=self.args.visual_feature_size
)
self.config = config
def train_dataloader(self):
dataset = Simmc2Dataset(
tokenizer=self.tokenizer,
feature_loader=self.feature_loader,
load_path=self.args.train_file,
args=self.args
)
dl = DataLoader(
dataset,
batch_size=self.config['training']['batch_size'],
shuffle=True,
collate_fn=dataset.collate_fn,
)
return dl
def val_dataloader(self):
dataset = Simmc2Dataset(
tokenizer=self.tokenizer,
feature_loader=self.feature_loader,
load_path=self.args.dev_file,
args=self.args,
)
dl = DataLoader(
dataset,
batch_size=self.config['training']['batch_size'],
shuffle=False,
collate_fn=dataset.collate_fn,
)
return dl
def test_dataloader(self):
dataset = Simmc2Dataset(
tokenizer=self.tokenizer,
feature_loader=self.feature_loader,
load_path=self.args.devtest_file,
args=self.args,
)
dl = DataLoader(
dataset,
batch_size=self.config['training']['batch_size'],
shuffle=False,
collate_fn=dataset.collate_fn,
)
return dl

0
src/models/__init__.py Normal file

179
src/models/base_model.py Normal file

@ -0,0 +1,179 @@
import pytorch_lightning as pl
import torch
from torch import nn
from torch.optim import AdamW
from src.utils.positional_encoding import PositionalEncoding
from src.object_description_encoder.object_description_encoder import ObjectDescriptionEncoder
import torchmetrics as metrics
from transformers import get_cosine_schedule_with_warmup
from transformers import AutoModel
from src.combiner.option_a import CombinerOptionA
from transformers import AutoTokenizer
class TransformerModel(pl.LightningModule):
def __init__(self, config, output_path=None):
super().__init__()
self.output_path = output_path
self.config = config['model']
self.train_config = config['training']
self.train_acc = metrics.Accuracy('multiclass', num_classes=40)
self.val_acc = metrics.Accuracy('multiclass', num_classes=40)
self.test_acc = metrics.Accuracy('multiclass', num_classes=40)
self.best_val_acc = 0
self.loss_for_best_val_acc = 0
self.best_train_acc = 0
self.combiner = CombinerOptionA()
self.initialize_text_encoder_and_feature_mapping()
self.positional_encoder = PositionalEncoding(
d_model=self.model_input_dim, dropout=self.config['dropout_p'], max_len=self.config['dim_feedforward']
)
encoder_layer = nn.TransformerEncoderLayer(
d_model=self.model_input_dim,
batch_first=True,
dropout=self.config['dropout_p'],
dim_feedforward=self.config['dim_feedforward'],
nhead=self.config['n_heads']
)
self.encoder = nn.TransformerEncoder(
encoder_layer=encoder_layer,
num_layers=self.config['n_encoder_layers'],
)
self.loss = nn.CrossEntropyLoss()
if self.config['feature_type'] == 'object_text_features':
self.object_description_encoder = ObjectDescriptionEncoder(
d_model=self.config['v_emb_dim'],
config=self.config
)
# maps the output from the pretrained lm to a smaller size used for encoding the object description (reduces transformer size)
self.linear_projection_object_description = nn.Linear(self.pretrained_lm.config.hidden_size, self.config['v_emb_dim'])
# tokenizer for translation from ids to text
self.tokenizer = AutoTokenizer.from_pretrained(self.config['pretrained_lm_name'])
def initialize_text_encoder_and_feature_mapping(self):
if self.config['use_pretrained_lm']:
self.pretrained_lm = AutoModel.from_pretrained(
self.config['pretrained_lm_name'],
add_pooling_layer=False
)
self.pretrained_lm.eval()
# don't train the parameters of the pretrained lm
self.pretrained_lm.config.training = True
# for param in self.pretrained_lm.parameters():
# param.requires_grad = False
# initialize the projection layers to map the embeddings to the correct input dim
# either use the emb_dim as done in aloe (v_emb_dim * n_heads) or the emb_dim specified in the config
if self.config['projection_as_in_aloe']:
self.model_input_dim = self.config['n_heads'] * self.config['v_emb_dim']
self.linear_projection_video = nn.Linear(self.config['v_emb_dim'], self.model_input_dim - 2)
self.linear_projection_text = nn.Linear(self.pretrained_lm.config.hidden_size, self.model_input_dim - 2)
else:
# take the embedding size from the config and map the video features from their size to the chosen emb size
self.linear_projection_video = nn.Linear(self.config['v_emb_dim'], self.config['emb_dim'] - 2)
self.linear_projection_text = nn.Linear(self.pretrained_lm.config.hidden_size, self.config['emb_dim'] - 2)
self.model_input_dim = self.config['emb_dim']
else:
# either use the emb_dim as done in aloe (v_emb_dim * n_heads) or the emb_dim from the config (2 is subtracted from the projection size to leave room for the two appended id channels)
if self.config['projection_as_in_aloe']:
self.model_input_dim = self.config['n_heads'] * self.config['v_emb_dim']
else:
self.model_input_dim = self.config['emb_dim']
self.linear_projection_video = nn.Linear(self.config['v_emb_dim'], self.model_input_dim - 2)
self.embed = nn.Embedding(num_embeddings=self.config['vocab_size'], embedding_dim=self.model_input_dim - 2)
def append_ids(self, tensor, id_vector, axis):
id_vector = torch.tensor(id_vector, device=self.device)
for a in range(len(tensor.shape)):
if a != axis:
id_vector = torch.unsqueeze(id_vector, axis=a)
tiling_vector = [s if i != axis else 1 for i, s in enumerate(tensor.shape)]
id_tensor = torch.tile(id_vector, tiling_vector)
return torch.concat([tensor, id_tensor], axis=axis)
def downsample_video_emb(self, video_emb):
return video_emb[:, ::self.config['sample_rate_video'], :, :]
def unroll_video_emb(self, video_emb):
video_emb = video_emb.permute(0, 1, 3, 2)
return torch.reshape(video_emb, (video_emb.shape[0], -1, video_emb.shape[3]))
def apply_pretrained_lm(self, query, query_mask):
output = self.pretrained_lm(
input_ids=query,
attention_mask=query_mask
)
return output['last_hidden_state']
def prepare_lang_emb(self, query, query_mask):
# set maximum query length (TODO: expose this as a config parameter)
if query.shape[1] > 100:
query = query[:, :100]
query_mask = query_mask[:, :100]
# apply pretrained language model to embed the query if specified
if self.config['use_pretrained_lm']:
lang_emb = self.apply_pretrained_lm(query, query_mask)
else:
lang_emb = self.embed(query)
# Aloe uses an emb_dim of v_emb_dim * n_heads. Or use the emb_dim specified in the config
if self.config['use_pretrained_lm']:
lang_emb = self.linear_projection_text(lang_emb)
lang_emb = self.append_ids(lang_emb, [1, 0], 2)
lang_emb = self.positional_encoder(lang_emb)
return lang_emb
def prepare_video_emb(self, video_emb):
# shape: [batch, frames, v_emb_dim, objects]
video_emb = self.downsample_video_emb(video_emb)
# unroll time dimension in object dimension (only take every _ frame) - shape: [batch, objects x frames, v_emb_dim + 2]
video_emb = self.unroll_video_emb(video_emb)
# video_emb need to be projected to either the size of the language emb or the emb_size given by v_emb_dim * n_heads (As done in the Aloe paper)
#if self.config['use_pretrained_lm'] or self.config['projection_as_in_aloe']:
video_emb = self.linear_projection_video(video_emb)
video_emb = self.append_ids(video_emb, [0, 1], 2)
video_emb = self.positional_encoder(video_emb)
return video_emb
def forward(self, batch):
output = self.answer_query(batch.query, batch.query_mask, batch.vft)
return output
def configure_optimizers(self):
opt = AdamW(self.parameters(), lr=self.train_config['lr'])
sched = get_cosine_schedule_with_warmup(
opt,
num_warmup_steps=self.train_config['warmup_steps'],
num_training_steps=self.train_config['total_steps'],
)
return {
'optimizer': opt,
'lr_scheduler': {
'scheduler': sched,
'interval': 'step'
}
}
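
The two extra feature channels added by `append_ids` are what the `- 2` in the projection sizes above accounts for: language tokens get the id pair `[1, 0]` and video tokens `[0, 1]` appended along the feature axis. A simplified re-illustration (feature axis fixed, shapes made up):

```python
import torch

# Simplified version of TransformerModel.append_ids for the feature axis only.
def append_modality_ids(tensor, id_vector):
    ids = torch.tensor(id_vector, dtype=tensor.dtype)                      # e.g. [1, 0]
    ids = ids.view(1, 1, -1).expand(tensor.shape[0], tensor.shape[1], -1)  # tile per token
    return torch.cat([tensor, ids], dim=2)

lang_emb = torch.randn(2, 12, 14)               # projected to model_input_dim - 2
tagged = append_modality_ids(lang_emb, [1, 0])  # [1, 0] marks language tokens
print(tagged.shape)       # torch.Size([2, 12, 16])
print(tagged[0, 0, -2:])  # tensor([1., 0.])
```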


@ -0,0 +1,137 @@
from src.models.state_tracker_model import StateTrackerModel
import torch
from torch import nn
from src.utils.text_utils import translate_from_ids_to_text
import pandas as pd
class DiscriminativeModel(StateTrackerModel):
def __init__(self, config, output_path=None):
super().__init__(config, output_path=output_path)
self.fc = nn.Linear(self.model_input_dim, self.config["fc_dim"])
self.relu = nn.ReLU()
self.output = nn.Linear(self.config["fc_dim"], 40)
def apply_model(self, language_emb, language_emb_mask, video_emb, v_state=None, d_state=None, answer_emb=None, answer_mask=None, state_generation_mode=None):
# analogous to the CLS token from BERT models
dummy_word = torch.zeros(self.model_input_dim, requires_grad=True, device=self.device)
dummy_word = torch.tile(dummy_word, (language_emb.shape[0], 1, 1))
# combine state and embeddings
input = self.combiner(
video_emb,
language_emb,
language_emb_mask,
v_state,
d_state,
dummy_word
)
# create input mask based on the language_emb_mask (complete video is unmasked)
input_mask = torch.zeros((input.shape[0], input.shape[1]), device=self.device)
offset = 1
if v_state is not None: offset += 1
if d_state is not None: offset += 1
# offset is caused by cls token and state vectors
if self.config['model_type'] == 'extended_model':
# set offset to 1 if combiner B is used: the state vectors are not separate input tokens but are concatenated with the embeddings
if self.ext_config['combiner_option'] == 'OptionB':
offset = 1
input_mask[:, video_emb.shape[1] + offset:] = ~language_emb_mask
x = self.encoder(input, src_key_padding_mask=input_mask)
# only pass transformed dummy word to the dense layers
x = self.fc(x[:, 0, :])
x = self.relu(x)
output = self.output(x)
return output
def answer_query(self, query, query_mask, vft, v_state=None, d_state=None, answer=None, answer_mask=None, state_generation_mode=False):
video_emb = self.prepare_video_emb(vft)
lang_emb = self.prepare_lang_emb(query, query_mask)
if answer is not None and answer_mask is not None:
answer_emb = self.prepare_lang_emb(answer, answer_mask)
else:
answer_emb = None
output = self.apply_model(lang_emb, query_mask, video_emb, v_state, d_state, answer_emb, answer_mask, state_generation_mode)
return output
def training_step(self, train_batch, batch_idx):
train_batch.move_to_cuda()
label = torch.squeeze(train_batch.answer)
out = self.forward(train_batch)
loss = self.loss(out, label)
tr_acc = self.train_acc(out.softmax(dim=1), label)
if tr_acc > self.best_train_acc:
self.best_train_acc = tr_acc
self.log("train_acc", tr_acc, prog_bar=True, on_step=False, on_epoch=True, batch_size=train_batch.query.shape[0])
self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=train_batch.query.shape[0])
print('train_loss: {} | train_acc: {}'.format(loss, tr_acc))
return loss
def validation_step(self, val_batch, batch_idx):
val_batch.move_to_cuda()
label = torch.squeeze(val_batch.answer)
out = self.forward(val_batch)
loss = self.loss(out, label)
self.val_acc(out.softmax(dim=1), label)
self.log("val_acc", self.val_acc, prog_bar=True, on_step=False, on_epoch=True, batch_size=val_batch.query.shape[0])
self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=val_batch.query.shape[0])
return {'val_loss': loss, 'val_acc': self.val_acc.compute()}
def test_step(self, test_batch, batch_idx):
test_batch.move_to_cuda()
label = torch.squeeze(test_batch.answer)
out = self.forward(test_batch)
loss = self.loss(out, label)
self.test_acc(out.softmax(dim=1), label)
self.log("test_acc", self.test_acc, prog_bar=True, on_step=False, on_epoch=True, batch_size=test_batch.query.shape[0])
self.log("test_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=test_batch.query.shape[0])
# save the results into a dictionary
out = torch.argmax(out, dim=1)
question_as_text = []
for i in range(test_batch.query.shape[0]):
question_ids = test_batch.query[i, :]
question_as_text.append(translate_from_ids_to_text(question_ids, self.tokenizer))
self.results['question'].extend(question_as_text)
self.results['video_name'].extend(test_batch.video_name)
self.results['qa_id'].extend(test_batch.qa_ids)
self.results['q_type'].extend(test_batch.q_type)
self.results['label'].extend(label.tolist())
self.results['output'].extend(out.tolist())
self.results['attribute_dependency'].extend(test_batch.attribute_dependency)
self.results['object_dependency'].extend(test_batch.object_dependency)
self.results['temporal_dependency'].extend(test_batch.temporal_dependency)
self.results['spatial_dependency'].extend(test_batch.spatial_dependency)
self.results['q_complexity'].extend(test_batch.q_complexity)
def on_test_start(self):
self.results = {
'qa_id': [],
'q_type': [],
'label': [],
'output': [],
'attribute_dependency': [],
'object_dependency': [],
'temporal_dependency': [],
'spatial_dependency': [],
'q_complexity': [],
# only needed for input output analysis
'question': [],
'video_name': []
}
def on_test_end(self):
df = pd.DataFrame.from_dict(self.results)
df.to_pickle(self.output_path)
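
Since `on_test_end` pickles the per-question results, a quick post-hoc analysis could look like the sketch below; the path is a placeholder for whatever `output_path` was configured, and the column names are the keys filled in `test_step` above:

```python
# Sketch: inspect the pickled test results written by on_test_end.
import pandas as pd

df = pd.read_pickle("output/dvd_results.pkl")   # placeholder path
df["correct"] = df["label"] == df["output"]

print("overall accuracy:", df["correct"].mean())
print(df.groupby("q_type")["correct"].mean())        # accuracy per question type
print(df.groupby("q_complexity")["correct"].mean())  # accuracy per complexity level
```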


@ -0,0 +1,350 @@
# code is partly inspired from https://pytorch.org/tutorials/beginner/translation_transformer.html
from src.models.state_tracker_model import StateTrackerModel
from src.utils.batch_interfaces import batch_interface_simmc2_to_dvd, batch_interface_avsd_to_dvd
from dataclasses import dataclass
import torch
from torch import nn
from torchtext.data.metrics import bleu_score
import json
import os
from transformers import AutoTokenizer
import nltk
import numpy as np
from src.utils.text_utils import normalize_sentence, translate_from_ids_to_text
class GenerativeModel(StateTrackerModel):
def __init__(self, config, output_path=None):
super().__init__(config, output_path=output_path)
self.transformer = nn.Transformer(
d_model=self.model_input_dim,
batch_first=True,
dropout=self.config['dropout_p'],
dim_feedforward=self.config['dim_feedforward'],
nhead=self.config['n_heads'],
num_encoder_layers=self.config['n_encoder_layers'],
num_decoder_layers=self.config['n_decoder_layers'],
custom_encoder=self.encoder
)
self.prob_generator = nn.Linear(self.model_input_dim, self.config['vocab_size'])
self.pad_id = 1
self.unk_id = 3
self.loss = nn.CrossEntropyLoss(ignore_index=self.pad_id)
# tokenizer for translation from ids to text
self.tokenizer = AutoTokenizer.from_pretrained(self.config['pretrained_lm_name'])
# ---TODO: Remove ------
self.results = {}
self.epoch_count = 0
# -----------------------
self.batch_interface = batch_interface_simmc2_to_dvd
def encode_object_descriptions(self, vft):
# embed the object descriptions using the pretrained LM and then create the object tokens using transformer layers
if self.config['feature_type'] == "object_text_features":
object_features = []
for i in range(vft.shape[1]):
object_description = vft[:, i, :]
object_description_mask = (object_description != 1)
embedded_object_description = self.apply_pretrained_lm(object_description, object_description_mask)
# map embeddings to a smaller size (motivation: reduce the transformer size of the object description encoder)
embedded_object_description = self.linear_projection_object_description(embedded_object_description)
#apply transformer to encode the object description
object_token = self.object_description_encoder(embedded_object_description)
object_features.append(object_token)
object_features = torch.concat(object_features, dim=1)
# add frame dimension (only one frame in this case)
object_features = object_features.unsqueeze(1)
#bring the data to the format [batch_size x frames x emb_dim (desc_text_len) x obj_number]
vft = object_features.permute(0, 1, 3, 2)
return vft
def create_target_mask(self, size):
mask = torch.triu(torch.ones((size,size), device=self.device), 1)
mask = mask.masked_fill(mask == 1, float('-inf'))
return mask
def generate_prob_for_next_tokens(self, input, answer_emb, tgt_mask, input_mask, answer_mask):
x = self.transformer.encoder(input, src_key_padding_mask=input_mask)
dec_out = self.transformer.decoder(answer_emb, x, tgt_mask)
probs = self.prob_generator(dec_out)
return probs
def generate_complete_answers(self, input, input_mask):
# encode the complete batch of questions
memory = self.transformer.encoder(input, src_key_padding_mask=input_mask)
generated_answers = torch.ones(memory.shape[0], 40, dtype=torch.int) # 40 = max answer length; pre-filled with the padding token (id 1)
# generate the answers for each individual question from the batch
for i in range(memory.shape[0]):
memory_i = memory[i, :, :]
memory_i = memory_i.unsqueeze(0)
answer_i = torch.zeros((1,1), dtype=torch.int, device=self.device) # pass the start token <s> to the decoder as first input (roberta vocab: "<s>": 0, "</s>": 2)
for j in range(40): # 40 = max answer length
answer_i_emb = self.prepare_lang_emb(answer_i, torch.ones((1, answer_i.shape[0]), device=self.device, dtype=torch.int16))
tgt_mask = self.create_target_mask(answer_i.shape[1])
decoder_output = self.transformer.decoder(answer_i_emb, memory_i, tgt_mask)
prob = self.prob_generator(decoder_output[:, -1, :])
next_word = prob.argmax()
answer_i = torch.concat([answer_i, next_word.unsqueeze(0).unsqueeze(0)], dim=1)
if next_word.item() == 2: # eos token in roberta vocab "</s>": 2
break
generated_answers[i, :answer_i.shape[1] - 1] = answer_i[0, 1:]
return generated_answers
def apply_model(self, language_emb, language_emb_mask, video_emb, v_state=None, d_state=None, answer_emb=None, answer_mask=None, state_generation_mode=False):
# combine state and embeddings
input = self.combiner(
video_emb,
language_emb,
language_emb_mask,
v_state,
d_state
)
# create input mask based on the language_emb_mask (complete video is unmasked)
input_mask = torch.zeros((input.shape[0], input.shape[1]), device=self.device)
offset = 0
if v_state is not None: offset += 1
if d_state is not None: offset += 1
# offset is caused by state vectors
input_mask[:, video_emb.shape[1] + offset:] = ~language_emb_mask
tgt_mask = self.create_target_mask(answer_emb.shape[1])
#-------TODO: Mask padded object embeddings when text based object embeddings are used -------------
if self.mode == 'train' or state_generation_mode:
probs = self.generate_prob_for_next_tokens(input, answer_emb, tgt_mask, input_mask, answer_mask)
return probs
elif self.mode == 'val':
generated_answers = self.generate_complete_answers(input, input_mask)
return generated_answers
def prepare_answer_emb_and_mask(self, answer, answer_mask):
mask = torch.tril(torch.ones((answer.shape[1], answer.shape[1]), device=self.device))
mask = mask.unsqueeze(0)
mask = mask.expand(answer.shape[0], -1, -1)
answer_emb = self.apply_pretrained_lm(answer, mask)
answer_emb = self.linear_projection_text(answer_emb)
answer_emb = self.append_ids(answer_emb, [1, 0], 2)
answer_emb = self.positional_encoder(answer_emb)
# pytorch interprets True in a mask as padding
answer_mask = ~answer_mask
answer_emb_final = answer_emb[:, :-1].detach()
answer_mask_final = answer_mask[:, :-1].detach()
return answer_emb_final, answer_mask_final
def answer_query(self, query, query_mask, vft, v_state=None, d_state=None, answer=None, answer_mask=None, state_generation_mode=False):
video_emb = self.prepare_video_emb(vft)
lang_emb = self.prepare_lang_emb(query, query_mask)
answer_emb, answer_mask = self.prepare_answer_emb_and_mask(answer, answer_mask)
output = self.apply_model(lang_emb, query_mask, video_emb, v_state, d_state, answer_emb, answer_mask, state_generation_mode)
return output
def training_step(self, train_batch, batch_idx):
train_batch = self.batch_interface(train_batch, feature_type=self.config['feature_type'])
if self.config['feature_type'] == "object_text_features":
train_batch.vft = self.encode_object_descriptions(train_batch.vft)
logits = self.forward(train_batch)
logits = logits.permute(0, 2, 1)
# replace unknown tokens (id 3) with the padding token so the loss ignores them and the model is not pushed to output <unk>
train_batch.answer[train_batch.answer == 3] = 1
loss = self.loss(logits, train_batch.answer[:, 1:]) # ignore padding
self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=train_batch.query.shape[0])
return loss
def get_next_token_pred_as_text_and_logits(self, batch):
# set mode to train to get the logits instead of completely generated sentences
self.mode = 'train'
logits = self.forward(batch)
logits = logits.permute(0, 2, 1)
predicted_tokens = []
for j in range(logits.shape[0]):
l = logits[j, :, :]
ids = [l[:, i].argmax().item() for i in range(l.shape[1])]
text = translate_from_ids_to_text(ids, self.tokenizer)
predicted_tokens.append(text)
# set mode back to val
self.mode = 'val'
return predicted_tokens, logits
def calculate_bleu_score(self, generated_answer_ids, correct_answer):
# calculate bleu score for the generated answers compared to the provided correct answers
bleu4_scores = []
all_generated_answers = []
for i in range(generated_answer_ids.shape[0]):
generated_answer = generated_answer_ids[i, :].tolist()
generated_answer_text = translate_from_ids_to_text(generated_answer, self.tokenizer)
all_generated_answers.append(generated_answer_text)
correct_answer_text_i = correct_answer[i]
score4 = nltk.translate.bleu_score.sentence_bleu(
[normalize_sentence(correct_answer_text_i)],
normalize_sentence(generated_answer_text),
smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method7
)
bleu4_scores.append(score4)
bleu4_score = np.mean(bleu4_scores)
return bleu4_score, all_generated_answers
def translate_answer_ids_to_text(self, answer):
correct_answer_text = []
for i in range(answer.shape[0]):
correct_answer_i = answer[i, :].tolist()
correct_answer_text_i = translate_from_ids_to_text(correct_answer_i, self.tokenizer)
correct_answer_text.append(correct_answer_text_i)
return correct_answer_text
def validation_step(self, val_batch, batch_idx):
val_batch = self.batch_interface(val_batch, feature_type=self.config['feature_type'])
if self.config['feature_type'] == "object_text_features":
val_batch.vft = self.encode_object_descriptions(val_batch.vft)
correct_answer_text = self.translate_answer_ids_to_text(val_batch.answer)
generated_answer_ids = self.forward(val_batch)
# calculate and log bleu score for the generated answers compared to the provided correct answers
bleu4_score, generated_answers_text = self.calculate_bleu_score(generated_answer_ids, correct_answer_text)
self.log('bleu4', bleu4_score, prog_bar=True, on_step=False, on_epoch=True, batch_size=generated_answer_ids.shape[0])
# calculate and log the validation loss based on the results from next token prediction (train mode needed)
predicted_tokens, logits = self.get_next_token_pred_as_text_and_logits(val_batch)
loss = self.loss(logits, val_batch.answer[:, 1:]) # ignore padding
self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=val_batch.query.shape[0])
return {'next_token_predictions': predicted_tokens, 'generated_answers': generated_answers_text, 'correct_answers': correct_answer_text}
def test_step(self, test_batch, batch_idx):
dialog_id = test_batch['dialog_id']
turn_id = test_batch['turn_id']
test_batch = self.batch_interface(test_batch, feature_type=self.config['feature_type'])
if self.config['feature_type'] == "object_text_features":
test_batch.vft = self.encode_object_descriptions(test_batch.vft)
correct_answer_text = self.translate_answer_ids_to_text(test_batch.answer)
generated_answer_ids = self.forward(test_batch)
# calculate and log bleu score for the generated answers compared to the provided correct answers
bleu4_score, generated_answers_text = self.calculate_bleu_score(generated_answer_ids, correct_answer_text)
self.log('bleu4', bleu4_score, prog_bar=True, on_step=False, on_epoch=True, batch_size=generated_answer_ids.shape[0])
# calculate and log the validation loss based on the results from next token prediction (train mode needed)
predicted_tokens, logits = self.get_next_token_pred_as_text_and_logits(test_batch)
loss = self.loss(logits, test_batch.answer[:, 1:]) # ignore padding
self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True, batch_size=test_batch.query.shape[0])
return {'turn_id': turn_id, 'next_token_predictions': predicted_tokens, 'dialog_id': dialog_id, 'generated_answers': generated_answers_text, 'correct_answers': correct_answer_text}
def test_epoch_end(self, outputs):
if self.config['output_format'] == 'submission':
responses = []
for output in outputs:
for t_id, d_id, answer in zip(output['turn_id'], output['dialog_id'], output['generated_answers']):
sample = {
'dialog_id': d_id,
'predictions': [
{
'turn_id': t_id,
'response': answer
}
]
}
responses.append(sample)
name = 'dstc11-simmc-devtest-pred-subtask-4-generation.json'
with open(os.path.join(self.output_path, name), 'w') as file:
json.dump(responses, file)
else:
result_idx = 0
for output in outputs:
for j in range(len(output['next_token_predictions'])):
pred = " "
corr = " "
gen = " "
self.results[result_idx] = {
'next_token_pred': pred.join(output['next_token_predictions'][j]),
'generated_ans': gen.join(output['generated_answers'][j]),
'correct': corr.join(output['correct_answers'][j])
}
result_idx += 1
name = f'epoch_{self.epoch_count}.json'
with open(os.path.join(self.output_path, name), 'w') as file:
json.dump(self.results, file)
def validation_epoch_end(self, outputs):
result_idx = 0
for output in outputs:
for j in range(len(output['next_token_predictions'])):
pred = " "
corr = " "
gen = " "
self.results[result_idx] = {
'next_token_pred': pred.join(output['next_token_predictions'][j]),
'generated_ans': gen.join(output['generated_answers'][j]),
'correct': corr.join(output['correct_answers'][j])
}
result_idx += 1
name = f'epoch_{self.epoch_count}.json'
with open(os.path.join(self.output_path, name), 'w') as file:
json.dump(self.results, file)
self.results = {}
self.epoch_count += 1
def on_train_epoch_start(self):
self.mode = 'train'
def on_validation_epoch_start(self):
self.mode = 'val'
def on_test_epoch_start(self):
self.mode = 'val'
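
For reference, the causal mask built by `create_target_mask` is the standard upper-triangular `-inf` mask used for autoregressive decoding; a tiny standalone example:

```python
import torch

# Same construction as GenerativeModel.create_target_mask, for size 4.
size = 4
mask = torch.triu(torch.ones((size, size)), 1)
mask = mask.masked_fill(mask == 1, float("-inf"))
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
```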


@ -0,0 +1,167 @@
import pytorch_lightning as pl
import torch
from torch import nn
from src.models.base_model import TransformerModel
from src.utils.save_attention_weights import SaveOutput
from src.utils.custom_transformer_encoder_layer import CustomTransformerEncoderLayer
from src.state_trackers.video_state_tracker import VstLSTM
from src.state_trackers.dialogue_state_tracker import DstLSTM
from src.state_trackers.vst_transformer_based import VstTransformer
from src.state_trackers.dst_transformer_based import DstTransformer
from src.combiner.option_a import CombinerOptionA
from src.combiner.option_b import CombinerOptionB
from src.combiner.option_c import CombinerOptionC
class StateTrackerModel(TransformerModel):
def __init__(self, config, output_path=None):
super().__init__(config, output_path=output_path)
self.config = config['model']
self.ext_config = config['extended_model']
combine_state_and_emb_options = {
'OptionA': CombinerOptionA,
'OptionB': CombinerOptionB,
'OptionC': CombinerOptionC,
}
state_tracker_options = {
'Transformer': {
'vst': VstTransformer,