commit b5f3b728c3c94d091030277eea98c9e31f152db6 Author: Adnen Abdessaied Date: Wed Mar 30 10:46:35 2022 +0200 Initial commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..28437b2 --- /dev/null +++ b/README.md @@ -0,0 +1,124 @@ +This is the official code of the paper **Video Language Co-Attention with Fast-Learning Feature Fusion for VideoQA**. +If you find our code useful, please cite our paper: + +# Overview +

*(Overview figure)*

+ +# Results +Our VLCN model achieves **new** state-of-the-art results on two open-ended VideoQA datasets **MSVD-QA** and **MSRVTT-QA**. +#### MSVD-QA +|
Model | What | Who | How | When | Where | All
| +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| ST-VQA | 18.10 | 50.00 | **83.80** | 72.40 | 28.60 | 31.30 | +| Co-Mem | 19.60 | 48.70 | 81.60 | 74.10 | 31.70 | 31.70 | +| HMEMA | 22.40 | 50.00 | 73.00 | 70.70 | 42.90 | 33.70 | +| SSML | - | - | - | - | - | 35.13 | +| QueST | 24.50 | **52.90** | 79.10 | 72.40 | **50.00** | 36.10 | +| HCRN | - | - | - | - | - | 36.10 | +| MA-DRNN | 24.30 | 51.60 | 82.00 | **86.30** | 26.30 | 36.20 | +| **VLCN (Ours)** | **28.42** | 51.29 | 81.08 | 74.13 | 46.43 | **38.06** | + +#### MSRVTT-QA +|
Model | What | Who | How | When | Where | All
| +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| ST-VQA | 24.50 | 41.20 | 78.00 | 76.50 | 34.90 | 30.90 | +| Co-Mem | 23.90 | 42.50 | 74.10 | 69.00 | **42.90** | 32.00 | +| HMEMA | 22.40 | **50.10** | 73.00 | 70.70 | 42.90 | 33.70 | +| QueST | 27.90 | 45.60 | **83.00** | 75.70 | 31.60 | 34.60 | +| SSML | - | - | - | - | - | 35.00 | +| HCRN | - | - | - | - | - | 35.60 | +| **VLCN (Ours)** | **30.69** | 44.09 | 79.82 | **78.29** | 36.80 | **36.01** | + +# Requirements +- PyTorch 1.3.1
+- Torchvision 0.4.2
+- Python 3.6
+
+# Raw data
+The raw data of MSVD-QA and MSRVTT-QA are located in ``data/MSVD-QA`` and ``data/MSRVTT-QA``, respectively.
+ +**Videos:** The raw videos of MSVD-QA and MSRVTT-QA can be downloaded from [⬇](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/) and [⬇](https://www.mediafire.com/folder/h14iarbs62e7p/shared), respectively.
+**Text:** The text data can be downloaded from [⬇](https://github.com/xudejing/video-question-answering).
+
+After downloading all the raw data, ``data/MSVD-QA`` and ``data/MSRVTT-QA`` should have the following structure:

*(Figure: expected directory layout of the dataset folders.)*
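The original figure is not reproduced here. A plausible layout, inferred from ``cfgs/path_cfgs.py`` and ``core/data/preprocess.py`` in this commit (shown for MSVD-QA; MSRVTT-QA is analogous), and possibly differing in detail from the figure:

```
data/MSVD-QA
├── videos/        # raw videos
├── frames/        # sampled frames (written by preprocess.py)
├── clips/         # sampled 16-frame clips (written by preprocess.py)
├── frame_feat/    # VGG-19 frame features, one .pt file per video
├── clip_feat/     # C3D clip features, one .pt file per video
├── train_qa.json
├── val_qa.json
├── test_qa.json
└── c3d.pickle     # pretrained C3D weights
```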

+
+# Preprocessing
+To sample the individual frames and clips and generate the corresponding visual features, run the script ``preprocess.py`` on the raw videos with the appropriate flags. For example, for MSVD-QA:
+```bash
+python core/data/preprocess.py --RAW_VID_PATH /data/MSVD-QA/videos --C3D_PATH path_to_pretrained_c3d
+```
+This will save the individual frames and clips in ``data/MSVD-QA/frames`` and ``data/MSVD-QA/clips``, and their visual features in ``data/MSVD-QA/frame_feat`` and ``data/MSVD-QA/clip_feat``, respectively.
+
+# Config files
+Before starting training, one has to update the config path file ``cfgs/path_cfgs.py`` with the paths of the raw data as well as the visual features.
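For orientation, here is a minimal sketch of how these paths are resolved, based on ``cfgs/path_cfgs.py`` and ``cfgs/base_cfgs.py`` from this commit (``run.py`` presumably builds this object from the command-line flags; the explicit construction below is only for illustration):

```python
# Minimal sketch (not part of the official pipeline): how the dataset paths
# are derived from DATASET_PATH by the config classes in this commit.
from cfgs.base_cfgs import Cfgs

# Note the trailing slash: the QA and C3D paths are built by plain string concatenation.
cfg = Cfgs(EXP_NAME='experiment', DATASET_PATH='data/MSVD-QA/')
cfg.init_path()                  # also creates the ./experiment/... result folders

print(cfg.FRAMES)                # data/MSVD-QA/frame_feat/
print(cfg.CLIPS)                 # data/MSVD-QA/clip_feat/
print(cfg.QA_PATH['train'])      # data/MSVD-QA/train_qa.json
print(cfg.C3D_PATH)              # data/MSVD-QA/c3d.pickle
```

If the feature folders live elsewhere, adjust ``FRAMES`` and ``CLIPS`` in ``cfgs/path_cfgs.py`` accordingly.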
+All hyperparameters can be adjusted in ``cfgs/base_cfgs.py``.
+
+# Training
+To start training, one has to specify an experiment directory ``EXP_NAME`` where all the results (log files, checkpoints, tensorboard files, etc.) will be saved. Furthermore, one needs to specify the ``MODEL_TYPE`` of the VLCN variant to be trained:
+|
MODEL_TYPE | Description
| +| :---: | :---: | +| 1 | VLCN | +| 2 | VLCN-FLF | +| 3 | VLCV+LSTM | +| 4 | MCAN | + +These parameters can be set inline. E.g. by executing +```bash +python run.py --EXP_NAME experiment --MODEL_TYPE 1 --DATA_PATH /data/MSRVD-QA --GPU 1 --SEED 42 +``` +# Pre-trained models +Our pre-trained models are available here [⬇](https://drive.google.com/drive/folders/172yj4iUkF1U1WOPdA5KuKOTQXkgzFEzS) + +# Acknowledgements +We thank the Vision and Language Group@ MIL for their [MCAN](https://github.com/MILVLG/mcan-vqa) open source implementation, [DavidA](https://github.com/DavideA/c3d-pytorch/blob/master/C3D_model.py) for his pretrained C3D model and finally [ixaxaar](https://github.com/ixaxaar/pytorch-dnc) for his DNC implementation. diff --git a/assets/.gitkeep b/assets/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/assets/overview_project_one.png b/assets/overview_project_one.png new file mode 100644 index 0000000..e9c9922 Binary files /dev/null and b/assets/overview_project_one.png differ diff --git a/assets/structure.png b/assets/structure.png new file mode 100644 index 0000000..fd000e3 Binary files /dev/null and b/assets/structure.png differ diff --git a/cfgs/.gitkeep b/cfgs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/cfgs/base_cfgs.py b/cfgs/base_cfgs.py new file mode 100644 index 0000000..a6e8c12 --- /dev/null +++ b/cfgs/base_cfgs.py @@ -0,0 +1,267 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from cfgs.path_cfgs import PATH + +import os, torch, random +import numpy as np +from types import MethodType + + +class Cfgs(PATH): + def __init__(self, EXP_NAME, DATASET_PATH): + super(Cfgs, self).__init__(EXP_NAME, DATASET_PATH) + + # Set Devices + # If use multi-gpu training, set e.g.'0, 1, 2' instead + self.GPU = '0' + + # Set RNG For CPU And GPUs + self.SEED = random.randint(0, 99999999) + + # ------------------------- + # ---- Version Control ---- + # ------------------------- + + # Define a specific name to start new training + # self.VERSION = 'Anonymous_' + str(self.SEED) + self.VERSION = str(self.SEED) + + # Resume training + self.RESUME = False + + # Used in Resume training and testing + self.CKPT_VERSION = self.VERSION + self.CKPT_EPOCH = 0 + + # Absolutely checkpoint path, 'CKPT_VERSION' and 'CKPT_EPOCH' will be overridden + self.CKPT_PATH = None + + # Print loss every step + self.VERBOSE = True + + + # ------------------------------ + # ---- Data Provider Params ---- + # ------------------------------ + + # {'train', 'val', 'test'} + self.RUN_MODE = 'train' + + # Set True to evaluate offline + self.EVAL_EVERY_EPOCH = True + + # # Define the 'train' 'val' 'test' data split + # # (EVAL_EVERY_EPOCH triggered when set {'train': 'train'}) + # self.SPLIT = { + # 'train': '', + # 'val': 'val', + # 'test': 'test', + # } + # # A external method to set train split + # self.TRAIN_SPLIT = 'train+val+vg' + + # Set True to use pretrained word embedding + # (GloVe: spaCy https://spacy.io/) + self.USE_GLOVE = True + + # Word embedding matrix size + # (token size x WORD_EMBED_SIZE) + self.WORD_EMBED_SIZE = 300 + + # Max length of question sentences + self.MAX_TOKEN = 15 + + # VGG 4096D features + self.FRAME_FEAT_SIZE = 4096 + + # C3D 4096D features + self.CLIP_FEAT_SIZE = 4096 + + self.NUM_ANS = 1000 + + # Default 
training batch size: 64 + self.BATCH_SIZE = 64 + + # Multi-thread I/O + self.NUM_WORKERS = 8 + + # Use pin memory + # (Warning: pin memory can accelerate GPU loading but may + # increase the CPU memory usage when NUM_WORKS is large) + self.PIN_MEM = True + + # Large model can not training with batch size 64 + # Gradient accumulate can split batch to reduce gpu memory usage + # (Warning: BATCH_SIZE should be divided by GRAD_ACCU_STEPS) + self.GRAD_ACCU_STEPS = 1 + + # Set 'external': use external shuffle method to implement training shuffle + # Set 'internal': use pytorch dataloader default shuffle method + self.SHUFFLE_MODE = 'external' + + + # ------------------------ + # ---- Network Params ---- + # ------------------------ + + # Model deeps + # (Encoder and Decoder will be same deeps) + self.LAYER = 6 + + # Model hidden size + # (512 as default, bigger will be a sharp increase of gpu memory usage) + self.HIDDEN_SIZE = 512 + + # Multi-head number in MCA layers + # (Warning: HIDDEN_SIZE should be divided by MULTI_HEAD) + self.MULTI_HEAD = 8 + + # Dropout rate for all dropout layers + # (dropout can prevent overfitting: [Dropout: a simple way to prevent neural networks from overfitting]) + self.DROPOUT_R = 0.1 + + # MLP size in flatten layers + self.FLAT_MLP_SIZE = 512 + + # Flatten the last hidden to vector with {n} attention glimpses + self.FLAT_GLIMPSES = 1 + self.FLAT_OUT_SIZE = 1024 + + + # -------------------------- + # ---- Optimizer Params ---- + # -------------------------- + + # The base learning rate + self.LR_BASE = 0.0001 + + # Learning rate decay ratio + self.LR_DECAY_R = 0.2 + + # Learning rate decay at {x, y, z...} epoch + self.LR_DECAY_LIST = [10, 12] + + # Max training epoch + self.MAX_EPOCH = 30 + + # Gradient clip + # (default: -1 means not using) + self.GRAD_NORM_CLIP = -1 + + # Adam optimizer betas and eps + self.OPT_BETAS = (0.9, 0.98) + self.OPT_EPS = 1e-9 + self.OPT_WEIGHT_DECAY = 1e-5 + # -------------------------- + # ---- DNC Hyper-Params ---- + # -------------------------- + self.IN_SIZE_DNC = self.HIDDEN_SIZE + self.OUT_SIZE_DNC = self.HIDDEN_SIZE + self.WORD_LENGTH_DNC = 512 + self.CELL_COUNT_DNC = 64 + self.MEM_HIDDEN_SIZE = self.CELL_COUNT_DNC * self.WORD_LENGTH_DNC + self.N_READ_HEADS_DNC = 4 + + def parse_to_dict(self, args): + args_dict = {} + for arg in dir(args): + if not arg.startswith('_') and not isinstance(getattr(args, arg), MethodType): + if getattr(args, arg) is not None: + args_dict[arg] = getattr(args, arg) + + return args_dict + + + def add_args(self, args_dict): + for arg in args_dict: + setattr(self, arg, args_dict[arg]) + + + def proc(self): + assert self.RUN_MODE in ['train', 'val', 'test'] + + # ------------ Devices setup + # os.environ['CUDA_VISIBLE_DEVICES'] = self.GPU + self.N_GPU = len(self.GPU.split(',')) + self.DEVICES = [_ for _ in range(self.N_GPU)] + torch.set_num_threads(2) + + + # ------------ Seed setup + # fix pytorch seed + torch.manual_seed(self.SEED) + if self.N_GPU < 2: + torch.cuda.manual_seed(self.SEED) + else: + torch.cuda.manual_seed_all(self.SEED) + torch.backends.cudnn.deterministic = True + + # fix numpy seed + np.random.seed(self.SEED) + + # fix random seed + random.seed(self.SEED) + + if self.CKPT_PATH is not None: + print('Warning: you are now using CKPT_PATH args, ' + 'CKPT_VERSION and CKPT_EPOCH will not work') + self.CKPT_VERSION = self.CKPT_PATH.split('/')[-1] + '_' + str(random.randint(0, 99999999)) + + + # ------------ Split setup + self.SPLIT['train'] = self.TRAIN_SPLIT + if 'val' in 
self.SPLIT['train'].split('+') or self.RUN_MODE not in ['train']: + self.EVAL_EVERY_EPOCH = False + + if self.RUN_MODE not in ['test']: + self.TEST_SAVE_PRED = False + + + # ------------ Gradient accumulate setup + assert self.BATCH_SIZE % self.GRAD_ACCU_STEPS == 0 + self.SUB_BATCH_SIZE = int(self.BATCH_SIZE / self.GRAD_ACCU_STEPS) + + # Use a small eval batch will reduce gpu memory usage + self.EVAL_BATCH_SIZE = 32 + + + # ------------ Networks setup + # FeedForwardNet size in every MCA layer + self.FF_SIZE = int(self.HIDDEN_SIZE * 4) + self.FF_MEM_SIZE = int() + + # A pipe line hidden size in attention compute + assert self.HIDDEN_SIZE % self.MULTI_HEAD == 0 + self.HIDDEN_SIZE_HEAD = int(self.HIDDEN_SIZE / self.MULTI_HEAD) + + + def __str__(self): + for attr in dir(self): + if not attr.startswith('__') and not isinstance(getattr(self, attr), MethodType): + print('{ %-17s }->' % attr, getattr(self, attr)) + + return '' + + def check_path(self): + print('Checking dataset ...') + + + if not os.path.exists(self.FRAMES): + print(self.FRAMES + 'NOT EXIST') + exit(-1) + + if not os.path.exists(self.CLIPS): + print(self.CLIPS + 'NOT EXIST') + exit(-1) + + for mode in self.QA_PATH: + if not os.path.exists(self.QA_PATH[mode]): + print(self.QA_PATH[mode] + 'NOT EXIST') + exit(-1) + + print('Finished') + print('') diff --git a/cfgs/fusion_cfgs.yml b/cfgs/fusion_cfgs.yml new file mode 100644 index 0000000..3e8dce6 --- /dev/null +++ b/cfgs/fusion_cfgs.yml @@ -0,0 +1,6 @@ +CONTROLLER_INPUT_SIZE: 512 +CONTROLLER_HIDDEN_SIZE: 512 +CONTROLLER_NUM_LAYERS: 2 +HIDDEN_DIM_COMP: 1024 +OUT_DIM_COMP: 512 +COMP_NUM_LAYERS: 2 diff --git a/cfgs/path_cfgs.py b/cfgs/path_cfgs.py new file mode 100644 index 0000000..1005a11 --- /dev/null +++ b/cfgs/path_cfgs.py @@ -0,0 +1,61 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +import os + +class PATH: + def __init__(self, EXP_NAME, DATASET_PATH): + # name of the experiment + self.EXP_NAME = EXP_NAME + + # Dataset root path + self.DATASET_PATH = DATASET_PATH + + # Bottom up features root path + self.FRAMES = os.path.join(DATASET_PATH, 'frame_feat/') + self.CLIPS = os.path.join(DATASET_PATH, 'clip_feat/') + + + def init_path(self): + self.QA_PATH = { + 'train': self.DATASET_PATH + 'train_qa.json', + 'val': self.DATASET_PATH + 'val_qa.json', + 'test': self.DATASET_PATH + 'test_qa.json', + } + self.C3D_PATH = self.DATASET_PATH + 'c3d.pickle' + + if self.EXP_NAME not in os.listdir('./'): + os.mkdir('./' + self.EXP_NAME) + os.mkdir('./' + self.EXP_NAME + '/results') + self.RESULT_PATH = './' + self.EXP_NAME + '/results/result_test/' + self.PRED_PATH = './' + self.EXP_NAME + '/results/pred/' + self.CACHE_PATH = './' + self.EXP_NAME + '/results/cache/' + self.LOG_PATH = './' + self.EXP_NAME + '/results/log/' + self.TB_PATH = './' + self.EXP_NAME + '/results/tensorboard/' + self.CKPTS_PATH = './' + self.EXP_NAME + '/ckpts/' + + if 'result_test' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/result_test/') + + if 'pred' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/pred/') + + if 'cache' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/cache') + + if 'log' not in os.listdir('./' + self.EXP_NAME + 
'/results'): + os.mkdir('./' + self.EXP_NAME + '/results/log') + + if 'tensorboard' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/tensorboard') + + if 'ckpts' not in os.listdir('./' + self.EXP_NAME): + os.mkdir('./' + self.EXP_NAME + '/ckpts') + + + def check_path(self): + raise NotImplementedError + diff --git a/cfgs/small_model.yml b/cfgs/small_model.yml new file mode 100644 index 0000000..b54e222 --- /dev/null +++ b/cfgs/small_model.yml @@ -0,0 +1,13 @@ +LAYER: 6 +HIDDEN_SIZE: 512 +MEM_HIDDEN_SIZE: 2048 +MULTI_HEAD: 8 +DROPOUT_R: 0.1 +FLAT_MLP_SIZE: 512 +FLAT_GLIMPSES: 1 +FLAT_OUT_SIZE: 1024 +LR_BASE: 0.0001 +LR_DECAY_R: 0.2 +GRAD_ACCU_STEPS: 1 +CKPT_VERSION: 'small' +CKPT_EPOCH: 13 diff --git a/code/.gitkeep b/code/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/assets/.gitkeep b/code/assets/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/assets/structure.png b/code/assets/structure.png new file mode 100644 index 0000000..fd000e3 Binary files /dev/null and b/code/assets/structure.png differ diff --git a/code/cfgs/.gitkeep b/code/cfgs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/cfgs/base_cfgs.py b/code/cfgs/base_cfgs.py new file mode 100644 index 0000000..a6e8c12 --- /dev/null +++ b/code/cfgs/base_cfgs.py @@ -0,0 +1,267 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from cfgs.path_cfgs import PATH + +import os, torch, random +import numpy as np +from types import MethodType + + +class Cfgs(PATH): + def __init__(self, EXP_NAME, DATASET_PATH): + super(Cfgs, self).__init__(EXP_NAME, DATASET_PATH) + + # Set Devices + # If use multi-gpu training, set e.g.'0, 1, 2' instead + self.GPU = '0' + + # Set RNG For CPU And GPUs + self.SEED = random.randint(0, 99999999) + + # ------------------------- + # ---- Version Control ---- + # ------------------------- + + # Define a specific name to start new training + # self.VERSION = 'Anonymous_' + str(self.SEED) + self.VERSION = str(self.SEED) + + # Resume training + self.RESUME = False + + # Used in Resume training and testing + self.CKPT_VERSION = self.VERSION + self.CKPT_EPOCH = 0 + + # Absolutely checkpoint path, 'CKPT_VERSION' and 'CKPT_EPOCH' will be overridden + self.CKPT_PATH = None + + # Print loss every step + self.VERBOSE = True + + + # ------------------------------ + # ---- Data Provider Params ---- + # ------------------------------ + + # {'train', 'val', 'test'} + self.RUN_MODE = 'train' + + # Set True to evaluate offline + self.EVAL_EVERY_EPOCH = True + + # # Define the 'train' 'val' 'test' data split + # # (EVAL_EVERY_EPOCH triggered when set {'train': 'train'}) + # self.SPLIT = { + # 'train': '', + # 'val': 'val', + # 'test': 'test', + # } + # # A external method to set train split + # self.TRAIN_SPLIT = 'train+val+vg' + + # Set True to use pretrained word embedding + # (GloVe: spaCy https://spacy.io/) + self.USE_GLOVE = True + + # Word embedding matrix size + # (token size x WORD_EMBED_SIZE) + self.WORD_EMBED_SIZE = 300 + + # Max length of question sentences + self.MAX_TOKEN = 15 + + # VGG 4096D features + self.FRAME_FEAT_SIZE = 4096 + + # C3D 4096D features + self.CLIP_FEAT_SIZE = 4096 + + self.NUM_ANS = 1000 + + # Default training batch size: 64 + self.BATCH_SIZE = 64 + + 
# Multi-thread I/O + self.NUM_WORKERS = 8 + + # Use pin memory + # (Warning: pin memory can accelerate GPU loading but may + # increase the CPU memory usage when NUM_WORKS is large) + self.PIN_MEM = True + + # Large model can not training with batch size 64 + # Gradient accumulate can split batch to reduce gpu memory usage + # (Warning: BATCH_SIZE should be divided by GRAD_ACCU_STEPS) + self.GRAD_ACCU_STEPS = 1 + + # Set 'external': use external shuffle method to implement training shuffle + # Set 'internal': use pytorch dataloader default shuffle method + self.SHUFFLE_MODE = 'external' + + + # ------------------------ + # ---- Network Params ---- + # ------------------------ + + # Model deeps + # (Encoder and Decoder will be same deeps) + self.LAYER = 6 + + # Model hidden size + # (512 as default, bigger will be a sharp increase of gpu memory usage) + self.HIDDEN_SIZE = 512 + + # Multi-head number in MCA layers + # (Warning: HIDDEN_SIZE should be divided by MULTI_HEAD) + self.MULTI_HEAD = 8 + + # Dropout rate for all dropout layers + # (dropout can prevent overfitting: [Dropout: a simple way to prevent neural networks from overfitting]) + self.DROPOUT_R = 0.1 + + # MLP size in flatten layers + self.FLAT_MLP_SIZE = 512 + + # Flatten the last hidden to vector with {n} attention glimpses + self.FLAT_GLIMPSES = 1 + self.FLAT_OUT_SIZE = 1024 + + + # -------------------------- + # ---- Optimizer Params ---- + # -------------------------- + + # The base learning rate + self.LR_BASE = 0.0001 + + # Learning rate decay ratio + self.LR_DECAY_R = 0.2 + + # Learning rate decay at {x, y, z...} epoch + self.LR_DECAY_LIST = [10, 12] + + # Max training epoch + self.MAX_EPOCH = 30 + + # Gradient clip + # (default: -1 means not using) + self.GRAD_NORM_CLIP = -1 + + # Adam optimizer betas and eps + self.OPT_BETAS = (0.9, 0.98) + self.OPT_EPS = 1e-9 + self.OPT_WEIGHT_DECAY = 1e-5 + # -------------------------- + # ---- DNC Hyper-Params ---- + # -------------------------- + self.IN_SIZE_DNC = self.HIDDEN_SIZE + self.OUT_SIZE_DNC = self.HIDDEN_SIZE + self.WORD_LENGTH_DNC = 512 + self.CELL_COUNT_DNC = 64 + self.MEM_HIDDEN_SIZE = self.CELL_COUNT_DNC * self.WORD_LENGTH_DNC + self.N_READ_HEADS_DNC = 4 + + def parse_to_dict(self, args): + args_dict = {} + for arg in dir(args): + if not arg.startswith('_') and not isinstance(getattr(args, arg), MethodType): + if getattr(args, arg) is not None: + args_dict[arg] = getattr(args, arg) + + return args_dict + + + def add_args(self, args_dict): + for arg in args_dict: + setattr(self, arg, args_dict[arg]) + + + def proc(self): + assert self.RUN_MODE in ['train', 'val', 'test'] + + # ------------ Devices setup + # os.environ['CUDA_VISIBLE_DEVICES'] = self.GPU + self.N_GPU = len(self.GPU.split(',')) + self.DEVICES = [_ for _ in range(self.N_GPU)] + torch.set_num_threads(2) + + + # ------------ Seed setup + # fix pytorch seed + torch.manual_seed(self.SEED) + if self.N_GPU < 2: + torch.cuda.manual_seed(self.SEED) + else: + torch.cuda.manual_seed_all(self.SEED) + torch.backends.cudnn.deterministic = True + + # fix numpy seed + np.random.seed(self.SEED) + + # fix random seed + random.seed(self.SEED) + + if self.CKPT_PATH is not None: + print('Warning: you are now using CKPT_PATH args, ' + 'CKPT_VERSION and CKPT_EPOCH will not work') + self.CKPT_VERSION = self.CKPT_PATH.split('/')[-1] + '_' + str(random.randint(0, 99999999)) + + + # ------------ Split setup + self.SPLIT['train'] = self.TRAIN_SPLIT + if 'val' in self.SPLIT['train'].split('+') or self.RUN_MODE not in ['train']: + 
self.EVAL_EVERY_EPOCH = False + + if self.RUN_MODE not in ['test']: + self.TEST_SAVE_PRED = False + + + # ------------ Gradient accumulate setup + assert self.BATCH_SIZE % self.GRAD_ACCU_STEPS == 0 + self.SUB_BATCH_SIZE = int(self.BATCH_SIZE / self.GRAD_ACCU_STEPS) + + # Use a small eval batch will reduce gpu memory usage + self.EVAL_BATCH_SIZE = 32 + + + # ------------ Networks setup + # FeedForwardNet size in every MCA layer + self.FF_SIZE = int(self.HIDDEN_SIZE * 4) + self.FF_MEM_SIZE = int() + + # A pipe line hidden size in attention compute + assert self.HIDDEN_SIZE % self.MULTI_HEAD == 0 + self.HIDDEN_SIZE_HEAD = int(self.HIDDEN_SIZE / self.MULTI_HEAD) + + + def __str__(self): + for attr in dir(self): + if not attr.startswith('__') and not isinstance(getattr(self, attr), MethodType): + print('{ %-17s }->' % attr, getattr(self, attr)) + + return '' + + def check_path(self): + print('Checking dataset ...') + + + if not os.path.exists(self.FRAMES): + print(self.FRAMES + 'NOT EXIST') + exit(-1) + + if not os.path.exists(self.CLIPS): + print(self.CLIPS + 'NOT EXIST') + exit(-1) + + for mode in self.QA_PATH: + if not os.path.exists(self.QA_PATH[mode]): + print(self.QA_PATH[mode] + 'NOT EXIST') + exit(-1) + + print('Finished') + print('') diff --git a/code/cfgs/fusion_cfgs.yml b/code/cfgs/fusion_cfgs.yml new file mode 100644 index 0000000..3e8dce6 --- /dev/null +++ b/code/cfgs/fusion_cfgs.yml @@ -0,0 +1,6 @@ +CONTROLLER_INPUT_SIZE: 512 +CONTROLLER_HIDDEN_SIZE: 512 +CONTROLLER_NUM_LAYERS: 2 +HIDDEN_DIM_COMP: 1024 +OUT_DIM_COMP: 512 +COMP_NUM_LAYERS: 2 diff --git a/code/cfgs/path_cfgs.py b/code/cfgs/path_cfgs.py new file mode 100644 index 0000000..1005a11 --- /dev/null +++ b/code/cfgs/path_cfgs.py @@ -0,0 +1,61 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +import os + +class PATH: + def __init__(self, EXP_NAME, DATASET_PATH): + # name of the experiment + self.EXP_NAME = EXP_NAME + + # Dataset root path + self.DATASET_PATH = DATASET_PATH + + # Bottom up features root path + self.FRAMES = os.path.join(DATASET_PATH, 'frame_feat/') + self.CLIPS = os.path.join(DATASET_PATH, 'clip_feat/') + + + def init_path(self): + self.QA_PATH = { + 'train': self.DATASET_PATH + 'train_qa.json', + 'val': self.DATASET_PATH + 'val_qa.json', + 'test': self.DATASET_PATH + 'test_qa.json', + } + self.C3D_PATH = self.DATASET_PATH + 'c3d.pickle' + + if self.EXP_NAME not in os.listdir('./'): + os.mkdir('./' + self.EXP_NAME) + os.mkdir('./' + self.EXP_NAME + '/results') + self.RESULT_PATH = './' + self.EXP_NAME + '/results/result_test/' + self.PRED_PATH = './' + self.EXP_NAME + '/results/pred/' + self.CACHE_PATH = './' + self.EXP_NAME + '/results/cache/' + self.LOG_PATH = './' + self.EXP_NAME + '/results/log/' + self.TB_PATH = './' + self.EXP_NAME + '/results/tensorboard/' + self.CKPTS_PATH = './' + self.EXP_NAME + '/ckpts/' + + if 'result_test' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/result_test/') + + if 'pred' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/pred/') + + if 'cache' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/cache') + + if 'log' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + 
self.EXP_NAME + '/results/log') + + if 'tensorboard' not in os.listdir('./' + self.EXP_NAME + '/results'): + os.mkdir('./' + self.EXP_NAME + '/results/tensorboard') + + if 'ckpts' not in os.listdir('./' + self.EXP_NAME): + os.mkdir('./' + self.EXP_NAME + '/ckpts') + + + def check_path(self): + raise NotImplementedError + diff --git a/code/cfgs/small_model.yml b/code/cfgs/small_model.yml new file mode 100644 index 0000000..b54e222 --- /dev/null +++ b/code/cfgs/small_model.yml @@ -0,0 +1,13 @@ +LAYER: 6 +HIDDEN_SIZE: 512 +MEM_HIDDEN_SIZE: 2048 +MULTI_HEAD: 8 +DROPOUT_R: 0.1 +FLAT_MLP_SIZE: 512 +FLAT_GLIMPSES: 1 +FLAT_OUT_SIZE: 1024 +LR_BASE: 0.0001 +LR_DECAY_R: 0.2 +GRAD_ACCU_STEPS: 1 +CKPT_VERSION: 'small' +CKPT_EPOCH: 13 diff --git a/code/core/.gitkeep b/code/core/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/core/data/.gitkeep b/code/core/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/core/data/dataset.py b/code/core/data/dataset.py new file mode 100644 index 0000000..217c5ea --- /dev/null +++ b/code/core/data/dataset.py @@ -0,0 +1,103 @@ +import glob, os, json, pickle +import numpy as np +from collections import defaultdict + +import torch +from torch.utils.data import Dataset +import torchvision.transforms as transforms + +from core.data.utils import tokenize, ans_stat, proc_ques, qlen_to_key, ans_to_key + + +class VideoQA_Dataset(Dataset): + def __init__(self, __C): + super(VideoQA_Dataset, self).__init__() + self.__C = __C + self.ans_size = __C.NUM_ANS + # load raw data + with open(__C.QA_PATH[__C.RUN_MODE], 'r') as f: + self.raw_data = json.load(f) + self.data_size = len(self.raw_data) + + splits = __C.SPLIT[__C.RUN_MODE].split('+') + + frames_list = glob.glob(__C.FRAMES + '*.pt') + clips_list = glob.glob(__C.CLIPS + '*.pt') + if 'msvd' in self.C.DATASET_PATH.lower(): + vid_ids = [int(s.split('/')[-1].split('.')[0][3:]) for s in frames_list] + else: + vid_ids = [int(s.split('/')[-1].split('.')[0][5:]) for s in frames_list] + self.frames_dict = {k: v for (k,v) in zip(vid_ids, frames_list)} + self.clips_dict = {k: v for (k,v) in zip(vid_ids, clips_list)} + del frames_list, clips_list + + q_list = [] + a_list = [] + a_dict = defaultdict(lambda: 0) + for split in ['train', 'val']: + with open(__C.QA_PATH[split], 'r') as f: + qa_data = json.load(f) + for d in qa_data: + q_list.append(d['question']) + a_list = d['answer'] + if d['answer'] not in a_dict: + a_dict[d['answer']] = 1 + else: + a_dict[d['answer']] += 1 + + top_answers = sorted(a_dict, key=a_dict.get, reverse=True) + self.qlen_bins_to_idx = { + '1-3': 0, + '4-8': 1, + '9-15': 2, + } + self.ans_rare_to_idx = { + '0-99': 0, + '100-299': 1, + '300-999': 2, + + } + self.qtypes_to_idx = { + 'what': 0, + 'who': 1, + 'how': 2, + 'when': 3, + 'where': 4, + } + + if __C.RUN_MODE == 'train': + self.ans_list = top_answers[:self.ans_size] + + self.ans_to_ix, self.ix_to_ans = ans_stat(self.ans_list) + + self.token_to_ix, self.pretrained_emb = tokenize(q_list, __C.USE_GLOVE) + self.token_size = self.token_to_ix.__len__() + print('== Question token vocab size:', self.token_size) + + self.idx_to_qtypes = {v: k for (k, v) in self.qtypes_to_idx.items()} + self.idx_to_qlen_bins = {v: k for (k, v) in self.qlen_bins_to_idx.items()} + self.idx_to_ans_rare = {v: k for (k, v) in self.ans_rare_to_idx.items()} + + def __getitem__(self, idx): + sample = self.raw_data[idx] + ques = sample['question'] + q_type = self.qtypes_to_idx[ques.split(' ')[0]] + ques_idx, qlen, _ = proc_ques(ques, self.token_to_ix, 
self.__C.MAX_TOKEN) + qlen_bin = self.qlen_bins_to_idx[qlen_to_key(qlen)] + + answer = sample['answer'] + answer = self.ans_to_ix.get(answer, np.random.randint(0, high=len(self.ans_list))) + ans_rarity = self.ans_rare_to_idx[ans_to_key(answer)] + + answer_one_hot = torch.zeros(self.ans_size) + answer_one_hot[answer] = 1.0 + + vid_id = sample['video_id'] + frames = torch.load(open(self.frames_dict[vid_id], 'rb')).cpu() + clips = torch.load(open(self.clips_dict[vid_id], 'rb')).cpu() + + return torch.from_numpy(ques_idx).long(), frames, clips, answer_one_hot, torch.tensor(answer).long(), \ + torch.tensor(q_type).long(), torch.tensor(qlen_bin).long(), torch.tensor(ans_rarity).long() + + def __len__(self): + return self.data_size diff --git a/code/core/data/preprocess.py b/code/core/data/preprocess.py new file mode 100644 index 0000000..5ac9616 --- /dev/null +++ b/code/core/data/preprocess.py @@ -0,0 +1,182 @@ +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import skvideo.io as skv +import torch +import pickle +from PIL import Image +import tqdm +import numpy as np +from model.C3D import C3D +import json +from torchvision.models import vgg19 +import torchvision.transforms as transforms +import torch.nn as nn +import argparse + + +def _select_frames(path, frame_num): + """Select representative frames for video. + Ignore some frames both at begin and end of video. + Args: + path: Path of video. + Returns: + frames: list of frames. + """ + frames = list() + video_data = skv.vread(path) + total_frames = video_data.shape[0] + # Ignore some frame at begin and end. + for i in np.linspace(0, total_frames, frame_num + 2)[1:frame_num + 1]: + frame_data = video_data[int(i)] + img = Image.fromarray(frame_data) + img = img.resize((224, 224), Image.BILINEAR) + frame_data = np.array(img) + frames.append(frame_data) + return frames + +def _select_clips(path, clip_num): + """Select self.batch_size clips for video. Each clip has 16 frames. + Args: + path: Path of video. + Returns: + clips: list of clips. 
+ """ + clips = list() + # video_info = skvideo.io.ffprobe(path) + video_data = skv.vread(path) + total_frames = video_data.shape[0] + height = video_data[1] + width = video_data.shape[2] + for i in np.linspace(0, total_frames, clip_num + 2)[1:clip_num + 1]: + # Select center frame first, then include surrounding frames + clip_start = int(i) - 8 + clip_end = int(i) + 8 + if clip_start < 0: + clip_end = clip_end - clip_start + clip_start = 0 + if clip_end > total_frames: + clip_start = clip_start - (clip_end - total_frames) + clip_end = total_frames + clip = video_data[clip_start:clip_end] + new_clip = [] + for j in range(16): + frame_data = clip[j] + img = Image.fromarray(frame_data) + img = img.resize((112, 112), Image.BILINEAR) + frame_data = np.array(img) * 1.0 + # frame_data -= self.mean[j] + new_clip.append(frame_data) + clips.append(new_clip) + return clips + +def preprocess_videos(video_dir, frame_num, clip_num): + frames_dir = os.path.join(os.path.dirname(video_dir), 'frames') + os.mkdir(frames_dir) + + clips_dir = os.path.join(os.path.dirname(video_dir), 'clips') + os.mkdir(clips_dir) + + for video_name in tqdm.tqdm(os.listdir(video_dir)): + video_path = os.path.join(video_dir, video_name) + frames = _select_frames(video_path, frame_num) + clips = _select_clips(video_path, clip_num) + + with open(os.path.join(frames_dir, video_name.split('.')[0] + '.pkl'), "wb") as f: + pickle.dump(frames, f, protocol=pickle.HIGHEST_PROTOCOL) + + with open(os.path.join(clips_dir, video_name.split('.')[0] + '.pkl'), "wb") as f: + pickle.dump(clips, f, protocol=pickle.HIGHEST_PROTOCOL) + + +def generate_video_features(path_frames, path_clips, c3d_path): + device = torch.device('cuda:0') + frame_feat_dir = os.path.join(os.path.dirname(path_frames), 'frame_feat') + os.makedirs(frame_feat_dir, exist_ok=True) + + clip_feat_dir = os.path.join(os.path.dirname(path_frames), 'clip_feat') + os.makedirs(clip_feat_dir, exist_ok=True) + + cnn = vgg19(pretrained=True) + in_features = cnn.classifier[-1].in_features + cnn.classifier = nn.Sequential( + *list(cnn.classifier.children())[:-1]) # remove last fc layer + cnn.to(device).eval() + c3d = C3D() + c3d.load_state_dict(torch.load(c3d_path)) + c3d.to(device).eval() + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), + (0.229, 0.224, 0.225))]) + for vid_name in tqdm.tqdm(os.listdir(path_frames)): + frame_path = os.path.join(path_frames, vid_name) + clip_path = os.path.join(path_clips, vid_name) + + frames = pickle.load(open(frame_path, 'rb')) + clips = pickle.load(open(clip_path, 'rb')) + + frames = [transform(f) for f in frames] + frame_feat = [] + clip_feat = [] + + for frame in frames: + with torch.no_grad(): + feat = cnn(frame.unsqueeze(0).to(device)) + frame_feat.append(feat) + for clip in clips: + # clip has shape (c x f x h x w) + clip = torch.from_numpy(np.float32(np.array(clip))) + clip = clip.transpose(3, 0) + clip = clip.transpose(3, 1) + clip = clip.transpose(3, 2).unsqueeze(0).to(device) + with torch.no_grad(): + feat = c3d(clip) + clip_feat.append(feat) + frame_feat = torch.cat(frame_feat, dim=0) + clip_feat = torch.cat(clip_feat, dim=0) + + torch.save(frame_feat, os.path.join(frame_feat_dir, vid_name.split('.')[0] + '.pt')) + torch.save(clip_feat, os.path.join(clip_feat_dir, vid_name.split('.')[0] + '.pt')) + +def parse_args(): + ''' + Parse input arguments + ''' + parser = argparse.ArgumentParser(description='Preprocessing Args') + + parser.add_argument('--RAW_VID_PATH', dest='RAW_VID_PATH', + 
help='The path to the raw videos', + required=True, + type=str) + + parser.add_argument('--FRAMES_OUTPUT_DIR', dest='FRAMES_OUTPUT_DIR', + help='The directory where the processed frames and their features will be stored', + required=True, + type=str) + + parser.add_argument('--CLIPS_OUTPUT_DIR', dest='FRAMES_OUTPUT_DIR', + help='The directory where the processed frames and their features will be stored', + required=True, + type=str) + + parser.add_argument('--C3D_PATH', dest='C3D_PATH', + help='Pretrained C3D path', + required=True, + type=str) + + parser.add_argument('--NUM_SAMPLES', dest='NUM_SAMPLES', + help='The number of frames/clips to be sampled from the video', + default=20, + type=int) + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + preprocess_videos(args.RAW_VID_PATH, args.NUM_SAMPLES, args.NUM_SAMPLES) + frames_dir = os.path.join(os.path.dirname(args.RAW_VID_PATH), 'frames') + clips_dir = os.path.join(os.path.dirname(args.RAW_VID_PATH), 'clips') + generate_video_features(frames_dir, clips_dir) diff --git a/code/core/data/utils.py b/code/core/data/utils.py new file mode 100644 index 0000000..44696c5 --- /dev/null +++ b/code/core/data/utils.py @@ -0,0 +1,81 @@ +import en_vectors_web_lg, random, re, json +import numpy as np + +def tokenize(ques_list, use_glove): + token_to_ix = { + 'PAD': 0, + 'UNK': 1, + } + + spacy_tool = None + pretrained_emb = [] + if use_glove: + spacy_tool = en_vectors_web_lg.load() + pretrained_emb.append(spacy_tool('PAD').vector) + pretrained_emb.append(spacy_tool('UNK').vector) + + for ques in ques_list: + words = re.sub( + r"([.,'!?\"()*#:;])", + '', + ques.lower() + ).replace('-', ' ').replace('/', ' ').split() + + for word in words: + if word not in token_to_ix: + token_to_ix[word] = len(token_to_ix) + if use_glove: + pretrained_emb.append(spacy_tool(word).vector) + + pretrained_emb = np.array(pretrained_emb) + + return token_to_ix, pretrained_emb + + +def proc_ques(ques, token_to_ix, max_token): + ques_ix = np.zeros(max_token, np.int64) + + words = re.sub( + r"([.,'!?\"()*#:;])", + '', + ques.lower() + ).replace('-', ' ').replace('/', ' ').split() + q_len = 0 + for ix, word in enumerate(words): + if word in token_to_ix: + ques_ix[ix] = token_to_ix[word] + q_len += 1 + else: + ques_ix[ix] = token_to_ix['UNK'] + + if ix + 1 == max_token: + break + + return ques_ix, q_len, len(words) + +def ans_stat(ans_list): + ans_to_ix, ix_to_ans = {}, {} + for i, ans in enumerate(ans_list): + ans_to_ix[ans] = i + ix_to_ans[i] = ans + + return ans_to_ix, ix_to_ans + +def shuffle_list(ans_list): + random.shuffle(ans_list) + +def qlen_to_key(q_len): + if 1<= q_len <=3: + return '1-3' + if 4<= q_len <=8: + return '4-8' + if 9<= q_len: + return '9-15' + +def ans_to_key(ans_idx): + if 0 <= ans_idx <= 99 : + return '0-99' + if 100 <= ans_idx <= 299 : + return '100-299' + if 300 <= ans_idx <= 999 : + return '300-999' diff --git a/code/core/exec.py b/code/core/exec.py new file mode 100644 index 0000000..0d7cab3 --- /dev/null +++ b/code/core/exec.py @@ -0,0 +1,523 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from core.data.dataset import VideoQA_Dataset +from core.model.net import Net1, Net2, Net3, Net4 +from core.model.optim import get_optim, adjust_lr +from core.metrics import 
get_acc +from tqdm import tqdm +from core.data.utils import shuffle_list + +import os, json, torch, datetime, pickle, copy, shutil, time, math +import numpy as np +import torch.nn as nn +import torch.utils.data as Data +from tensorboardX import SummaryWriter +from torch.autograd import Variable as var + +class Execution: + def __init__(self, __C): + self.__C = __C + print('Loading training set ........') + __C_train = copy.deepcopy(self.__C) + setattr(__C_train, 'RUN_MODE', 'train') + self.dataset = VideoQA_Dataset(__C_train) + + self.dataset_eval = None + if self.__C.EVAL_EVERY_EPOCH: + __C_eval = copy.deepcopy(self.__C) + setattr(__C_eval, 'RUN_MODE', 'val') + + print('Loading validation set for per-epoch evaluation ........') + self.dataset_eval = VideoQA_Dataset(__C_eval) + self.dataset_eval.ans_list = self.dataset.ans_list + self.dataset_eval.ans_to_ix, self.dataset_eval.ix_to_ans = self.dataset.ans_to_ix, self.dataset.ix_to_ans + self.dataset_eval.token_to_ix, self.dataset_eval.pretrained_emb = self.dataset.token_to_ix, self.dataset.pretrained_emb + + __C_test = copy.deepcopy(self.__C) + setattr(__C_test, 'RUN_MODE', 'test') + + self.dataset_test = VideoQA_Dataset(__C_test) + self.dataset_test.ans_list = self.dataset.ans_list + self.dataset_test.ans_to_ix, self.dataset_test.ix_to_ans = self.dataset.ans_to_ix, self.dataset.ix_to_ans + self.dataset_test.token_to_ix, self.dataset_test.pretrained_emb = self.dataset.token_to_ix, self.dataset.pretrained_emb + + self.writer = SummaryWriter(self.__C.TB_PATH) + + def train(self, dataset, dataset_eval=None): + # Obtain needed information + data_size = dataset.data_size + token_size = dataset.token_size + ans_size = dataset.ans_size + pretrained_emb = dataset.pretrained_emb + net = self.construct_net(self.__C.MODEL_TYPE) + if os.path.isfile(self.__C.PRETRAINED_PATH) and self.__C.MODEL_TYPE == 11: + print('Loading pretrained DNC-weigths') + net.load_pretrained_weights() + net.cuda() + net.train() + + # Define the multi-gpu training if needed + if self.__C.N_GPU > 1: + net = nn.DataParallel(net, device_ids=self.__C.DEVICES) + + # Define the binary cross entropy loss + # loss_fn = torch.nn.BCELoss(size_average=False).cuda() + loss_fn = torch.nn.BCELoss(reduction='sum').cuda() + # Load checkpoint if resume training + if self.__C.RESUME: + print(' ========== Resume training') + + if self.__C.CKPT_PATH is not None: + print('Warning: you are now using CKPT_PATH args, ' + 'CKPT_VERSION and CKPT_EPOCH will not work') + + path = self.__C.CKPT_PATH + else: + path = self.__C.CKPTS_PATH + \ + 'ckpt_' + self.__C.CKPT_VERSION + \ + '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl' + + # Load the network parameters + print('Loading ckpt {}'.format(path)) + ckpt = torch.load(path) + print('Finish!') + net.load_state_dict(ckpt['state_dict']) + + # Load the optimizer paramters + optim = get_optim(self.__C, net, data_size, ckpt['optim'], lr_base=ckpt['lr_base']) + optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH) + optim.optimizer.load_state_dict(ckpt['optimizer']) + + start_epoch = self.__C.CKPT_EPOCH + + else: + if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH): + shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION) + + os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION) + + optim = get_optim(self.__C, net, data_size, self.__C.OPTIM) + start_epoch = 0 + + loss_sum = 0 + named_params = list(net.named_parameters()) + grad_norm = np.zeros(len(named_params)) + + # Define multi-thread dataloader + if 
self.__C.SHUFFLE_MODE in ['external']: + dataloader = Data.DataLoader( + dataset, + batch_size=self.__C.BATCH_SIZE, + shuffle=False, + num_workers=self.__C.NUM_WORKERS, + pin_memory=self.__C.PIN_MEM, + drop_last=True + ) + else: + dataloader = Data.DataLoader( + dataset, + batch_size=self.__C.BATCH_SIZE, + shuffle=True, + num_workers=self.__C.NUM_WORKERS, + pin_memory=self.__C.PIN_MEM, + drop_last=True + ) + + # Training script + for epoch in range(start_epoch, self.__C.MAX_EPOCH): + + # Save log information + logfile = open( + self.__C.LOG_PATH + + 'log_run_' + self.__C.VERSION + '.txt', + 'a+' + ) + logfile.write( + 'nowTime: ' + + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + '\n' + ) + logfile.close() + + # Learning Rate Decay + if epoch in self.__C.LR_DECAY_LIST: + adjust_lr(optim, self.__C.LR_DECAY_R) + + # Externally shuffle + if self.__C.SHUFFLE_MODE == 'external': + shuffle_list(dataset.ans_list) + + time_start = time.time() + # Iteration + for step, ( + ques_ix_iter, + frames_feat_iter, + clips_feat_iter, + ans_iter, + _, + _, + _, + _ + ) in enumerate(dataloader): + + ques_ix_iter = ques_ix_iter.cuda() + frames_feat_iter = frames_feat_iter.cuda() + clips_feat_iter = clips_feat_iter.cuda() + ans_iter = ans_iter.cuda() + + optim.zero_grad() + + for accu_step in range(self.__C.GRAD_ACCU_STEPS): + + sub_frames_feat_iter = \ + frames_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + sub_clips_feat_iter = \ + clips_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + sub_ques_ix_iter = \ + ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + sub_ans_iter = \ + ans_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + + pred = net( + sub_frames_feat_iter, + sub_clips_feat_iter, + sub_ques_ix_iter + ) + + loss = loss_fn(pred, sub_ans_iter) + + # only mean-reduction needs be divided by grad_accu_steps + # removing this line wouldn't change our results because the speciality of Adam optimizer, + # but would be necessary if you use SGD optimizer. 
+ # loss /= self.__C.GRAD_ACCU_STEPS + # start_backward = time.time() + loss.backward() + + if self.__C.VERBOSE: + if dataset_eval is not None: + mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val'] + else: + mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test'] + + # logging + + self.writer.add_scalar( + 'train/loss', + loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE, + global_step=step + epoch * math.ceil(data_size / self.__C.BATCH_SIZE)) + + self.writer.add_scalar( + 'train/lr', + optim._rate, + global_step=step + epoch * math.ceil(data_size / self.__C.BATCH_SIZE)) + + print("\r[exp_name %s][version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % ( + self.__C.EXP_NAME, + self.__C.VERSION, + epoch + 1, + step, + int(data_size / self.__C.BATCH_SIZE), + mode_str, + loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE, + optim._rate, + ), end=' ') + + # Gradient norm clipping + if self.__C.GRAD_NORM_CLIP > 0: + nn.utils.clip_grad_norm_( + net.parameters(), + self.__C.GRAD_NORM_CLIP + ) + + # Save the gradient information + for name in range(len(named_params)): + norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \ + if named_params[name][1].grad is not None else 0 + grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS + + optim.step() + + time_end = time.time() + print('Finished in {}s'.format(int(time_end-time_start))) + + epoch_finish = epoch + 1 + + # Save checkpoint + state = { + 'state_dict': net.state_dict(), + 'optimizer': optim.optimizer.state_dict(), + 'lr_base': optim.lr_base, + 'optim': optim.lr_base, } + + torch.save( + state, + self.__C.CKPTS_PATH + + 'ckpt_' + self.__C.VERSION + + '/epoch' + str(epoch_finish) + + '.pkl' + ) + + # Logging + logfile = open( + self.__C.LOG_PATH + + 'log_run_' + self.__C.VERSION + '.txt', + 'a+' + ) + logfile.write( + 'epoch = ' + str(epoch_finish) + + ' loss = ' + str(loss_sum / data_size) + + '\n' + + 'lr = ' + str(optim._rate) + + '\n\n' + ) + logfile.close() + + # Eval after every epoch + if dataset_eval is not None: + self.eval( + net, + dataset_eval, + self.writer, + epoch, + valid=True, + ) + + loss_sum = 0 + grad_norm = np.zeros(len(named_params)) + + + # Evaluation + def eval(self, net, dataset, writer, epoch, valid=False): + + ans_ix_list = [] + pred_list = [] + q_type_list = [] + q_bin_list = [] + ans_rarity_list = [] + + ans_qtype_dict = {'what': [], 'who': [], 'how': [], 'when': [], 'where': []} + pred_qtype_dict = {'what': [], 'who': [], 'how': [], 'when': [], 'where': []} + + + ans_qlen_bin_dict = {'1-3': [], '4-8': [], '9-15': []} + pred_qlen_bin_dict = {'1-3': [], '4-8': [], '9-15': []} + + ans_ans_rarity_dict = {'0-99': [], '100-299': [], '300-999': []} + pred_ans_rarity_dict = {'0-99': [], '100-299': [], '300-999': []} + + data_size = dataset.data_size + + net.eval() + + if self.__C.N_GPU > 1: + net = nn.DataParallel(net, device_ids=self.__C.DEVICES) + + dataloader = Data.DataLoader( + dataset, + batch_size=self.__C.EVAL_BATCH_SIZE, + shuffle=False, + num_workers=self.__C.NUM_WORKERS, + pin_memory=True + ) + + for step, ( + ques_ix_iter, + frames_feat_iter, + clips_feat_iter, + _, + ans_iter, + q_type, + qlen_bin, + ans_rarity + ) in enumerate(dataloader): + print("\rEvaluation: [step %4d/%4d]" % ( + step, + int(data_size / self.__C.EVAL_BATCH_SIZE), + ), end=' ') + ques_ix_iter = ques_ix_iter.cuda() + frames_feat_iter = frames_feat_iter.cuda() + clips_feat_iter = clips_feat_iter.cuda() + with torch.no_grad(): + + pred = net( + frames_feat_iter, + clips_feat_iter, + ques_ix_iter 
+ ) + + pred_np = pred.cpu().data.numpy() + pred_argmax = np.argmax(pred_np, axis=1) + pred_list.extend(pred_argmax) + ans_ix_list.extend(ans_iter.tolist()) + q_type_list.extend(q_type.tolist()) + q_bin_list.extend(qlen_bin.tolist()) + ans_rarity_list.extend(ans_rarity.tolist()) + + print('') + + assert len(pred_list) == len(ans_ix_list) == len(q_type_list) == len(q_bin_list) == len(ans_rarity_list) + pred_list = [dataset.ix_to_ans[pred] for pred in pred_list] + ans_ix_list = [dataset.ix_to_ans[ans] for ans in ans_ix_list] + + # Run validation script + scores_per_qtype = { + 'what': {}, + 'who': {}, + 'how': {}, + 'when': {}, + 'where': {}, + } + scores_per_qlen_bin = { + '1-3': {}, + '4-8': {}, + '9-15': {}, + } + scores_ans_rarity_dict = { + '0-99': {}, + '100-299': {}, + '300-999': {} + } + + if valid: + # create vqa object and vqaRes object + for pred, ans, q_type in zip(pred_list, ans_ix_list, q_type_list): + pred_qtype_dict[dataset.idx_to_qtypes[q_type]].append(pred) + ans_qtype_dict[dataset.idx_to_qtypes[q_type]].append(ans) + + print('----------------- Computing scores -----------------') + acc = get_acc(ans_ix_list, pred_list) + print('----------------- Overall -----------------') + print('acc: {}'.format(acc)) + writer.add_scalar('acc/overall', acc, global_step=epoch) + + for q_type in scores_per_qtype: + print('----------------- Computing "{}" q-type scores -----------------'.format(q_type)) + # acc, wups_0, wups_1 = get_scores( + # ans_ix_dict[q_type], pred_ix_dict[q_type]) + acc = get_acc(ans_qtype_dict[q_type], pred_qtype_dict[q_type]) + print('acc: {}'.format(acc)) + writer.add_scalar( + 'acc/{}'.format(q_type), acc, global_step=epoch) + else: + for pred, ans, q_type, qlen_bin, a_rarity in zip( + pred_list, ans_ix_list, q_type_list, q_bin_list, ans_rarity_list): + + pred_qtype_dict[dataset.idx_to_qtypes[q_type]].append(pred) + ans_qtype_dict[dataset.idx_to_qtypes[q_type]].append(ans) + + pred_qlen_bin_dict[dataset.idx_to_qlen_bins[qlen_bin]].append(pred) + ans_qlen_bin_dict[dataset.idx_to_qlen_bins[qlen_bin]].append(ans) + + pred_ans_rarity_dict[dataset.idx_to_ans_rare[a_rarity]].append(pred) + ans_ans_rarity_dict[dataset.idx_to_ans_rare[a_rarity]].append(ans) + + print('----------------- Computing overall scores -----------------') + acc = get_acc(ans_ix_list, pred_list) + + print('----------------- Overall -----------------') + print('acc:{}'.format(acc)) + + + print('----------------- Computing q-type scores -----------------') + for q_type in scores_per_qtype: + acc = get_acc(ans_qtype_dict[q_type], pred_qtype_dict[q_type]) + print(' {} '.format(q_type)) + print('acc:{}'.format(acc)) + + print('----------------- Computing qlen-bins scores -----------------') + for qlen_bin in scores_per_qlen_bin: + + acc = get_acc(ans_qlen_bin_dict[qlen_bin], pred_qlen_bin_dict[qlen_bin]) + print(' {} '.format(qlen_bin)) + print('acc:{}'.format(acc)) + + print('----------------- Computing ans-rarity scores -----------------') + for a_rarity in scores_ans_rarity_dict: + acc = get_acc(ans_ans_rarity_dict[a_rarity], pred_ans_rarity_dict[a_rarity]) + print(' {} '.format(a_rarity)) + print('acc:{}'.format(acc)) + net.train() + + def construct_net(self, model_type): + if model_type == 1: + net = Net1( + self.__C, + self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + elif model_type == 2: + net = Net2( + self.__C, + self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + elif model_type == 3: + net = Net3( + self.__C, + 
self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + elif model_type == 4: + net = Net4( + self.__C, + self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + else: + raise ValueError('Net{} is not supported'.format(model_type)) + return net + + def run(self, run_mode, epoch=None): + self.set_seed(self.__C.SEED) + if run_mode == 'train': + self.empty_log(self.__C.VERSION) + self.train(self.dataset, self.dataset_eval) + + elif run_mode == 'val': + self.eval(self.dataset, valid=True) + + elif run_mode == 'test': + net = self.construct_net(self.__C.MODEL_TYPE) + assert epoch is not None + path = self.__C.CKPTS_PATH + \ + 'ckpt_' + self.__C.VERSION + \ + '/epoch' + str(epoch) + '.pkl' + print('Loading ckpt {}'.format(path)) + state_dict = torch.load(path)['state_dict'] + net.load_state_dict(state_dict) + net.cuda() + self.eval(net, self.dataset_test, self.writer, 0) + + else: + exit(-1) + + def set_seed(self, seed): + """Sets the seed for reproducibility. + Args: + seed (int): The seed used + """ + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + np.random.seed(seed) + print('\nSeed set to {}...\n'.format(seed)) + + def empty_log(self, version): + print('Initializing log file ........') + if (os.path.exists(self.__C.LOG_PATH + 'log_run_' + version + '.txt')): + os.remove(self.__C.LOG_PATH + 'log_run_' + version + '.txt') + print('Finished!') + print('') diff --git a/code/core/metrics.py b/code/core/metrics.py new file mode 100644 index 0000000..0a45f1f --- /dev/null +++ b/code/core/metrics.py @@ -0,0 +1,211 @@ +""" +Author: Mateusz Malinowski +Email: mmalinow@mpi-inf.mpg.de + +The script assumes there are two files +- first file with ground truth answers +- second file with predicted answers +both answers are line-aligned + +The script also assumes that answer items are comma separated. +For instance, chair,table,window + +It is also a set measure, so not exactly the same as accuracy +even if dirac measure is used since {book,book}=={book}, also {book,chair}={chair,book} + +Logs: + 05.09.2015 - white spaces surrounding words are stripped away so that {book, chair}={book,chair} +""" + +import sys + +#import enchant + +from numpy import prod +from nltk.corpus import wordnet as wn +from tqdm import tqdm + +def file2list(filepath): + with open(filepath,'r') as f: + lines =[k for k in + [k.strip() for k in f.readlines()] + if len(k) > 0] + + return lines + + +def list2file(filepath,mylist): + mylist='\n'.join(mylist) + with open(filepath,'w') as f: + f.writelines(mylist) + + +def items2list(x): + """ + x - string of comma-separated answer items + """ + return [l.strip() for l in x.split(',')] + + +def fuzzy_set_membership_measure(x,A,m): + """ + Set membership measure. 
+ x: element + A: set of elements + m: point-wise element-to-element measure m(a,b) ~ similarity(a,b) + + This function implments a fuzzy set membership measure: + m(x \in A) = max_{a \in A} m(x,a)} + """ + return 0 if A==[] else max(map(lambda a: m(x,a), A)) + + +def score_it(A,T,m): + """ + A: list of A items + T: list of T items + m: set membership measure + m(a \in A) gives a membership quality of a into A + + This function implements a fuzzy accuracy score: + score(A,T) = min{prod_{a \in A} m(a \in T), prod_{t \in T} m(a \in A)} + where A and T are set representations of the answers + and m is a measure + """ + if A==[] and T==[]: + return 1 + + # print A,T + + score_left=0 if A==[] else prod(list(map(lambda a: m(a,T), A))) + score_right=0 if T==[] else prod(list(map(lambda t: m(t,A),T))) + return min(score_left,score_right) + + +# implementations of different measure functions +def dirac_measure(a,b): + """ + Returns 1 iff a=b and 0 otherwise. + """ + if a==[] or b==[]: + return 0.0 + return float(a==b) + + +def wup_measure(a,b,similarity_threshold=0.925): + """ + Returns Wu-Palmer similarity score. + More specifically, it computes: + max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y) + where interp is a 'interpretation field' + """ + def get_semantic_field(a): + weight = 1.0 + semantic_field = wn.synsets(a,pos=wn.NOUN) + return (semantic_field,weight) + + + def get_stem_word(a): + """ + Sometimes answer has form word\d+:wordid. + If so we return word and downweight + """ + weight = 1.0 + return (a,weight) + + + global_weight=1.0 + + (a,global_weight_a)=get_stem_word(a) + (b,global_weight_b)=get_stem_word(b) + global_weight = min(global_weight_a,global_weight_b) + + if a==b: + # they are the same + return 1.0*global_weight + + if a==[] or b==[]: + return 0 + + + interp_a,weight_a = get_semantic_field(a) + interp_b,weight_b = get_semantic_field(b) + + if interp_a == [] or interp_b == []: + return 0 + + # we take the most optimistic interpretation + global_max=0.0 + for x in interp_a: + for y in interp_b: + local_score=x.wup_similarity(y) + if local_score > global_max: + global_max=local_score + + # we need to use the semantic fields and therefore we downweight + # unless the score is high which indicates both are synonyms + if global_max < similarity_threshold: + interp_weight = 0.1 + else: + interp_weight = 1.0 + + final_score=global_max*weight_a*weight_b*interp_weight*global_weight + return final_score +### + + +def get_scores(input_gt, input_pred, threshold_0=0.0, threshold_1=0.9): + element_membership_acc=dirac_measure + element_membership_wups_0=lambda x,y: wup_measure(x,y,threshold_0) + element_membership_wups_1=lambda x,y: wup_measure(x,y,threshold_1) + + set_membership_acc=\ + lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_acc) + set_membership_wups_0=\ + lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_wups_0) + set_membership_wups_1=\ + lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_wups_1) + + score_list_acc = [] + score_list_wups_0 = [] + score_list_wups_1 = [] + pbar = tqdm(zip(input_gt,input_pred)) + pbar.set_description('Computing Acc') + + for (ta,pa) in pbar: + score_list_acc.append(score_it(items2list(ta),items2list(pa),set_membership_acc)) + + #final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list)) + final_score_acc=float(sum(score_list_acc))/float(len(score_list_acc)) + final_score_acc *= 100.0 + + pbar = tqdm(zip(input_gt,input_pred)) + pbar.set_description('Computing Wups_0.0') + for (ta,pa) in 
pbar: + score_list_wups_0.append(score_it(items2list(ta),items2list(pa),set_membership_wups_0)) + #final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list)) + final_score_wups_0=float(sum(score_list_wups_0))/float(len(score_list_wups_0)) + final_score_wups_0 *= 100.0 + + pbar = tqdm(zip(input_gt,input_pred)) + pbar.set_description('Computing Wups_0.9') + for (ta,pa) in pbar: + score_list_wups_1.append(score_it(items2list(ta),items2list(pa),set_membership_wups_1)) + #final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list)) + final_score_wups_1=float(sum(score_list_wups_1))/float(len(score_list_wups_1)) + final_score_wups_1 *= 100.0 + + # filtering to obtain the results + #print 'full score:', score_list + # print('accuracy = {0:.2f} | WUPS@{1} = {2:.2f} | WUPS@{3} = {4:.2f}'.format( + # final_score_acc, threshold_0, final_score_wups_0, threshold_1, final_score_wups_1)) + return final_score_acc, final_score_wups_0, final_score_wups_1 + +def get_acc(gts, preds): + sum_correct = 0 + assert len(gts) == len(preds) + for gt, pred in zip(gts, preds): + if gt == pred: + sum_correct += 1 + acc = 100.0 * float(sum_correct/ len(gts)) + return acc diff --git a/code/core/model/.gitkeep b/code/core/model/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/code/core/model/C3D.py b/code/core/model/C3D.py new file mode 100644 index 0000000..198b002 --- /dev/null +++ b/code/core/model/C3D.py @@ -0,0 +1,80 @@ +""" +from https://github.com/DavideA/c3d-pytorch/blob/master/C3D_model.py +""" + + +import torch.nn as nn + + +class C3D(nn.Module): + """ + The C3D network as described in [1]. + """ + + def __init__(self): + super(C3D, self).__init__() + + self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) + + self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) + + self.fc6 = nn.Linear(8192, 4096) + self.fc7 = nn.Linear(4096, 4096) + self.fc8 = nn.Linear(4096, 487) + + self.dropout = nn.Dropout(p=0.5) + + self.relu = nn.ReLU() + self.softmax = nn.Softmax() + + def forward(self, x): + + h = self.relu(self.conv1(x)) + h = self.pool1(h) + + h = self.relu(self.conv2(h)) + h = self.pool2(h) + + h = self.relu(self.conv3a(h)) + h = self.relu(self.conv3b(h)) + h = self.pool3(h) + + h = self.relu(self.conv4a(h)) + h = self.relu(self.conv4b(h)) + h = self.pool4(h) + + h = self.relu(self.conv5a(h)) + h = self.relu(self.conv5b(h)) + h = self.pool5(h) + + h = h.view(-1, 8192) + h = self.relu(self.fc6(h)) + h = self.dropout(h) + h = self.relu(self.fc7(h)) + # h = self.dropout(h) + + # logits = self.fc8(h) + # probs = self.softmax(logits) + + return h + +""" +References +---------- +[1] Tran, Du, et al. 
"Learning spatiotemporal features with 3d convolutional networks." +Proceedings of the IEEE international conference on computer vision. 2015. +""" diff --git a/code/core/model/dnc.py b/code/core/model/dnc.py new file mode 100644 index 0000000..9fe8fd0 --- /dev/null +++ b/code/core/model/dnc.py @@ -0,0 +1,323 @@ +""" +PyTorch DNC implementation from +--> +https://github.com/ixaxaar/pytorch-dnc +<-- +""" +# -*- coding: utf-8 -*- + + +import torch.nn as nn +import torch as T +from torch.autograd import Variable as var +import numpy as np + +from torch.nn.utils.rnn import pad_packed_sequence as pad +from torch.nn.utils.rnn import pack_padded_sequence as pack +from torch.nn.utils.rnn import PackedSequence + +from .util import * +from .memory import * + +from torch.nn.init import orthogonal_, xavier_uniform_ + + +class DNC(nn.Module): + + def __init__( + self, + input_size, + hidden_size, + rnn_type='lstm', + num_layers=1, + num_hidden_layers=2, + bias=True, + batch_first=True, + dropout=0, + bidirectional=False, + nr_cells=5, + read_heads=2, + cell_size=10, + nonlinearity='tanh', + gpu_id=-1, + independent_linears=False, + share_memory=True, + debug=False, + clip=20 + ): + super(DNC, self).__init__() + # todo: separate weights and RNNs for the interface and output vectors + + self.input_size = input_size + self.hidden_size = hidden_size + self.rnn_type = rnn_type + self.num_layers = num_layers + self.num_hidden_layers = num_hidden_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = dropout + self.bidirectional = bidirectional + self.nr_cells = nr_cells + self.read_heads = read_heads + self.cell_size = cell_size + self.nonlinearity = nonlinearity + self.gpu_id = gpu_id + self.independent_linears = independent_linears + self.share_memory = share_memory + self.debug = debug + self.clip = clip + + self.w = self.cell_size + self.r = self.read_heads + + self.read_vectors_size = self.r * self.w + self.output_size = self.hidden_size + + self.nn_input_size = self.input_size + self.read_vectors_size + self.nn_output_size = self.output_size + self.read_vectors_size + + self.rnns = [] + self.memories = [] + + for layer in range(self.num_layers): + if self.rnn_type.lower() == 'rnn': + self.rnns.append(nn.RNN((self.nn_input_size if layer == 0 else self.nn_output_size), self.output_size, + bias=self.bias, nonlinearity=self.nonlinearity, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers)) + elif self.rnn_type.lower() == 'gru': + self.rnns.append(nn.GRU((self.nn_input_size if layer == 0 else self.nn_output_size), + self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers)) + if self.rnn_type.lower() == 'lstm': + self.rnns.append(nn.LSTM((self.nn_input_size if layer == 0 else self.nn_output_size), + self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers)) + setattr(self, self.rnn_type.lower() + '_layer_' + str(layer), self.rnns[layer]) + + # memories for each layer + if not self.share_memory: + self.memories.append( + Memory( + input_size=self.output_size, + mem_size=self.nr_cells, + cell_size=self.w, + read_heads=self.r, + gpu_id=self.gpu_id, + independent_linears=self.independent_linears + ) + ) + setattr(self, 'rnn_layer_memory_' + str(layer), self.memories[layer]) + + # only one memory shared by all layers + if self.share_memory: + self.memories.append( + Memory( + input_size=self.output_size, + mem_size=self.nr_cells, + cell_size=self.w, + 
read_heads=self.r, + gpu_id=self.gpu_id, + independent_linears=self.independent_linears + ) + ) + setattr(self, 'rnn_layer_memory_shared', self.memories[0]) + + # final output layer + self.output = nn.Linear(self.nn_output_size, self.output_size) + orthogonal_(self.output.weight) + + if self.gpu_id != -1: + [x.cuda(self.gpu_id) for x in self.rnns] + [x.cuda(self.gpu_id) for x in self.memories] + self.output.cuda() + + def _init_hidden(self, hx, batch_size, reset_experience): + # create empty hidden states if not provided + if hx is None: + hx = (None, None, None) + (chx, mhx, last_read) = hx + + # initialize hidden state of the controller RNN + if chx is None: + h = cuda(T.zeros(self.num_hidden_layers, batch_size, self.output_size), gpu_id=self.gpu_id) + xavier_uniform_(h) + + chx = [ (h, h) if self.rnn_type.lower() == 'lstm' else h for x in range(self.num_layers)] + + # Last read vectors + if last_read is None: + last_read = cuda(T.zeros(batch_size, self.w * self.r), gpu_id=self.gpu_id) + + # memory states + if mhx is None: + if self.share_memory: + mhx = self.memories[0].reset(batch_size, erase=reset_experience) + else: + mhx = [m.reset(batch_size, erase=reset_experience) for m in self.memories] + else: + if self.share_memory: + mhx = self.memories[0].reset(batch_size, mhx, erase=reset_experience) + else: + mhx = [m.reset(batch_size, h, erase=reset_experience) for m, h in zip(self.memories, mhx)] + + return chx, mhx, last_read + + def _debug(self, mhx, debug_obj): + if not debug_obj: + debug_obj = { + 'memory': [], + 'link_matrix': [], + 'precedence': [], + 'read_weights': [], + 'write_weights': [], + 'usage_vector': [], + } + + debug_obj['memory'].append(mhx['memory'][0].data.cpu().numpy()) + debug_obj['link_matrix'].append(mhx['link_matrix'][0][0].data.cpu().numpy()) + debug_obj['precedence'].append(mhx['precedence'][0].data.cpu().numpy()) + debug_obj['read_weights'].append(mhx['read_weights'][0].data.cpu().numpy()) + debug_obj['write_weights'].append(mhx['write_weights'][0].data.cpu().numpy()) + debug_obj['usage_vector'].append(mhx['usage_vector'][0].unsqueeze(0).data.cpu().numpy()) + return debug_obj + + def _layer_forward(self, input, layer, hx=(None, None), pass_through_memory=True): + (chx, mhx) = hx + + # pass through the controller layer + input, chx = self.rnns[layer](input.unsqueeze(1), chx) + input = input.squeeze(1) + + # clip the controller output + if self.clip != 0: + output = T.clamp(input, -self.clip, self.clip) + else: + output = input + + # the interface vector + ξ = output + + # pass through memory + if pass_through_memory: + if self.share_memory: + read_vecs, mhx = self.memories[0](ξ, mhx) + else: + read_vecs, mhx = self.memories[layer](ξ, mhx) + # the read vectors + read_vectors = read_vecs.view(-1, self.w * self.r) + else: + read_vectors = None + + return output, (chx, mhx, read_vectors) + + def forward(self, input, hx=(None, None, None), reset_experience=False, pass_through_memory=True): + # handle packed data + is_packed = type(input) is PackedSequence + if is_packed: + input, lengths = pad(input) + max_length = lengths[0] + else: + max_length = input.size(1) if self.batch_first else input.size(0) + lengths = [input.size(1)] * max_length if self.batch_first else [input.size(0)] * max_length + + batch_size = input.size(0) if self.batch_first else input.size(1) + + if not self.batch_first: + input = input.transpose(0, 1) + # make the data time-first + + controller_hidden, mem_hidden, last_read = self._init_hidden(hx, batch_size, reset_experience) + + # concat 
input with last read (or padding) vectors + inputs = [T.cat([input[:, x, :], last_read], 1) for x in range(max_length)] + + # batched forward pass per element / word / etc + if self.debug: + viz = None + + outs = [None] * max_length + read_vectors = None + rv = [None] * max_length + # pass through time + for time in range(max_length): + # pass thorugh layers + for layer in range(self.num_layers): + # this layer's hidden states + chx = controller_hidden[layer] + m = mem_hidden if self.share_memory else mem_hidden[layer] + # pass through controller + outs[time], (chx, m, read_vectors) = \ + self._layer_forward(inputs[time], layer, (chx, m), pass_through_memory) + + # debug memory + if self.debug: + viz = self._debug(m, viz) + + # store the memory back (per layer or shared) + if self.share_memory: + mem_hidden = m + else: + mem_hidden[layer] = m + controller_hidden[layer] = chx + + if read_vectors is not None: + # the controller output + read vectors go into next layer + outs[time] = T.cat([outs[time], read_vectors], 1) + if layer == self.num_layers - 1: + rv[time] = read_vectors.reshape(batch_size, self.r, self.w) + else: + outs[time] = T.cat([outs[time], last_read], 1) + inputs[time] = outs[time] + + if self.debug: + viz = {k: np.array(v) for k, v in viz.items()} + viz = {k: v.reshape(v.shape[0], v.shape[1] * v.shape[2]) for k, v in viz.items()} + + # pass through final output layer + inputs = [self.output(i) for i in inputs] + outputs = T.stack(inputs, 1 if self.batch_first else 0) + + if is_packed: + outputs = pack(output, lengths) + + if self.debug: + return outputs, (controller_hidden, mem_hidden, read_vectors), rv, viz + else: + return outputs, (controller_hidden, mem_hidden, read_vectors), rv + + def __repr__(self): + s = "\n----------------------------------------\n" + s += '{name}({input_size}, {hidden_size}' + if self.rnn_type != 'lstm': + s += ', rnn_type={rnn_type}' + if self.num_layers != 1: + s += ', num_layers={num_layers}' + if self.num_hidden_layers != 2: + s += ', num_hidden_layers={num_hidden_layers}' + if self.bias != True: + s += ', bias={bias}' + if self.batch_first != True: + s += ', batch_first={batch_first}' + if self.dropout != 0: + s += ', dropout={dropout}' + if self.bidirectional != False: + s += ', bidirectional={bidirectional}' + if self.nr_cells != 5: + s += ', nr_cells={nr_cells}' + if self.read_heads != 2: + s += ', read_heads={read_heads}' + if self.cell_size != 10: + s += ', cell_size={cell_size}' + if self.nonlinearity != 'tanh': + s += ', nonlinearity={nonlinearity}' + if self.gpu_id != -1: + s += ', gpu_id={gpu_id}' + if self.independent_linears != False: + s += ', independent_linears={independent_linears}' + if self.share_memory != True: + s += ', share_memory={share_memory}' + if self.debug != False: + s += ', debug={debug}' + if self.clip != 20: + s += ', clip={clip}' + + s += ")\n" + super(DNC, self).__repr__() + \ + "\n----------------------------------------\n" + return s.format(name=self.__class__.__name__, **self.__dict__) diff --git a/code/core/model/mca.py b/code/core/model/mca.py new file mode 100644 index 0000000..b373287 --- /dev/null +++ b/code/core/model/mca.py @@ -0,0 +1,208 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from core.model.net_utils import FC, MLP, LayerNorm +from 
core.model.dnc_improved import DNC, SharedMemDNC +from core.model.dnc_improved import FeedforwardController +import torch.nn as nn +import torch.nn.functional as F +import torch, math +import time + + +# ------------------------------ +# ---- Multi-Head Attention ---- +# ------------------------------ + +class MHAtt(nn.Module): + def __init__(self, __C): + super(MHAtt, self).__init__() + self.__C = __C + + self.linear_v = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + self.linear_k = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + self.linear_q = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + self.linear_merge = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + + self.dropout = nn.Dropout(__C.DROPOUT_R) + + def forward(self, v, k, q, mask): + n_batches = q.size(0) + + v = self.linear_v(v).view( + n_batches, + -1, + self.__C.MULTI_HEAD, + self.__C.HIDDEN_SIZE_HEAD + ).transpose(1, 2) + + k = self.linear_k(k).view( + n_batches, + -1, + self.__C.MULTI_HEAD, + self.__C.HIDDEN_SIZE_HEAD + ).transpose(1, 2) + + q = self.linear_q(q).view( + n_batches, + -1, + self.__C.MULTI_HEAD, + self.__C.HIDDEN_SIZE_HEAD + ).transpose(1, 2) + + atted = self.att(v, k, q, mask) + atted = atted.transpose(1, 2).contiguous().view( + n_batches, + -1, + self.__C.HIDDEN_SIZE + ) + + atted = self.linear_merge(atted) + + return atted + + def att(self, value, key, query, mask): + d_k = query.size(-1) + + scores = torch.matmul( + query, key.transpose(-2, -1) + ) / math.sqrt(d_k) + + if mask is not None: + scores = scores.masked_fill(mask, -1e9) + + att_map = F.softmax(scores, dim=-1) + att_map = self.dropout(att_map) + + return torch.matmul(att_map, value) + + + +# --------------------------- +# ---- Feed Forward Nets ---- +# --------------------------- + +class FFN(nn.Module): + def __init__(self, __C): + super(FFN, self).__init__() + + self.mlp = MLP( + in_size=__C.HIDDEN_SIZE, + mid_size=__C.FF_SIZE, + out_size=__C.HIDDEN_SIZE, + dropout_r=__C.DROPOUT_R, + use_relu=True + ) + + def forward(self, x): + return self.mlp(x) + + +# ------------------------ +# ---- Self Attention ---- +# ------------------------ + +class SA(nn.Module): + def __init__(self, __C): + super(SA, self).__init__() + self.mhatt = MHAtt(__C) + self.ffn = FFN(__C) + + self.dropout1 = nn.Dropout(__C.DROPOUT_R) + self.norm1 = LayerNorm(__C.HIDDEN_SIZE) + + self.dropout2 = nn.Dropout(__C.DROPOUT_R) + self.norm2 = LayerNorm(__C.HIDDEN_SIZE) + + def forward(self, x, x_mask): + x = self.norm1(x + self.dropout1( + self.mhatt(x, x, x, x_mask) + )) + + x = self.norm2(x + self.dropout2( + self.ffn(x) + )) + + return x + +# ------------------------------- +# ---- Self Guided Attention ---- +# ------------------------------- + +class SGA(nn.Module): + def __init__(self, __C): + super(SGA, self).__init__() + + self.mhatt1 = MHAtt(__C) + self.mhatt2 = MHAtt(__C) + self.ffn = FFN(__C) + + self.dropout1 = nn.Dropout(__C.DROPOUT_R) + self.norm1 = LayerNorm(__C.HIDDEN_SIZE) + + self.dropout2 = nn.Dropout(__C.DROPOUT_R) + self.norm2 = LayerNorm(__C.HIDDEN_SIZE) + + self.dropout3 = nn.Dropout(__C.DROPOUT_R) + self.norm3 = LayerNorm(__C.HIDDEN_SIZE) + + def forward(self, x, y, x_mask, y_mask): + x = self.norm1(x + self.dropout1( + self.mhatt1(x, x, x, x_mask) + )) + + x = self.norm2(x + self.dropout2( + self.mhatt2(y, y, x, y_mask) + )) + + x = self.norm3(x + self.dropout3( + self.ffn(x) + )) + + return x + + +# ------------------------------------------------ +# ---- MAC Layers Cascaded by Encoder-Decoder ---- +# ------------------------------------------------ + +class 
MCA_ED(nn.Module): + def __init__(self, __C): + super(MCA_ED, self).__init__() + + self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)]) + self.dec_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)]) + + def forward(self, x, y, x_mask, y_mask): + # Get hidden vector + for enc in self.enc_list: + x = enc(x, x_mask) + + for dec in self.dec_list: + y = dec(y, x, y_mask, x_mask) + return x, y + +class VLC(nn.Module): + def __init__(self, __C): + super(VLC, self).__init__() + + self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)]) + self.dec_lang_frames_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)]) + self.dec_lang_clips_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)]) + + + def forward(self, x, y, z, x_mask, y_mask, z_mask): + # Get hidden vector + for enc in self.enc_list: + x = enc(x, x_mask) + + for dec in self.dec_lang_frames_list: + y = dec(y, x, y_mask, x_mask) + + for dec in self.dec_lang_clips_list: + z = dec(z, x, z_mask, x_mask) + return x, y, z + diff --git a/code/core/model/memory.py b/code/core/model/memory.py new file mode 100644 index 0000000..97f1f11 --- /dev/null +++ b/code/core/model/memory.py @@ -0,0 +1,314 @@ +""" +PyTorch DNC implementation from +--> +https://github.com/ixaxaar/pytorch-dnc +<-- +""" +# -*- coding: utf-8 -*- + +import torch.nn as nn +import torch as T +from torch.autograd import Variable as var +import torch.nn.functional as F +import numpy as np + +from core.model.util import * + + +class Memory(nn.Module): + + def __init__(self, input_size, mem_size=512, cell_size=32, read_heads=4, gpu_id=-1, independent_linears=True): + super(Memory, self).__init__() + + self.input_size = input_size + self.mem_size = mem_size + self.cell_size = cell_size + self.read_heads = read_heads + self.gpu_id = gpu_id + self.independent_linears = independent_linears + + m = self.mem_size + w = self.cell_size + r = self.read_heads + + if self.independent_linears: + self.read_keys_transform = nn.Linear(self.input_size, w * r) + self.read_strengths_transform = nn.Linear(self.input_size, r) + self.write_key_transform = nn.Linear(self.input_size, w) + self.write_strength_transform = nn.Linear(self.input_size, 1) + self.erase_vector_transform = nn.Linear(self.input_size, w) + self.write_vector_transform = nn.Linear(self.input_size, w) + self.free_gates_transform = nn.Linear(self.input_size, r) + self.allocation_gate_transform = nn.Linear(self.input_size, 1) + self.write_gate_transform = nn.Linear(self.input_size, 1) + self.read_modes_transform = nn.Linear(self.input_size, 3 * r) + else: + self.interface_size = (w * r) + (3 * w) + (5 * r) + 3 + self.interface_weights = nn.Linear( + self.input_size, self.interface_size) + + self.I = cuda(1 - T.eye(m).unsqueeze(0), + gpu_id=self.gpu_id) # (1 * n * n) + + def reset(self, batch_size=1, hidden=None, erase=True): + m = self.mem_size + w = self.cell_size + r = self.read_heads + b = batch_size + + if hidden is None: + return { + 'memory': cuda(T.zeros(b, m, w).fill_(0), gpu_id=self.gpu_id), + 'link_matrix': cuda(T.zeros(b, 1, m, m), gpu_id=self.gpu_id), + 'precedence': cuda(T.zeros(b, 1, m), gpu_id=self.gpu_id), + 'read_weights': cuda(T.zeros(b, r, m).fill_(0), gpu_id=self.gpu_id), + 'write_weights': cuda(T.zeros(b, 1, m).fill_(0), gpu_id=self.gpu_id), + 'usage_vector': cuda(T.zeros(b, m), gpu_id=self.gpu_id), + # 'free_gates': cuda(T.zeros(b, r), gpu_id=self.gpu_id), + # 'alloc_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id), + # 'write_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id), + # 
'read_modes': cuda(T.zeros(b, r, 3), gpu_id=self.gpu_id) + } + else: + hidden['memory'] = hidden['memory'].clone() + hidden['link_matrix'] = hidden['link_matrix'].clone() + hidden['precedence'] = hidden['precedence'].clone() + hidden['read_weights'] = hidden['read_weights'].clone() + hidden['write_weights'] = hidden['write_weights'].clone() + hidden['usage_vector'] = hidden['usage_vector'].clone() + # hidden['free_gates'] = hidden['free_gates'].clone() + # hidden['alloc_gates'] = hidden['alloc_gates'].clone() + # hidden['write_gates'] = hidden['write_gates'].clone() + # hidden['read_modes'] = hidden['read_modes'].clone() + + if erase: + hidden['memory'].data.fill_(0) + hidden['link_matrix'].data.zero_() + hidden['precedence'].data.zero_() + hidden['read_weights'].data.fill_(0) + hidden['write_weights'].data.fill_(0) + hidden['usage_vector'].data.zero_() + # hidden['free_gates'].data.fill_() + # hidden['alloc_gates'].data.fill_() + # hidden['write_gates'].data.fill_() + # hidden['read_modes'].data.fill_() + + return hidden + + def get_usage_vector(self, usage, free_gates, read_weights, write_weights): + # write_weights = write_weights.detach() # detach from the computation graph + # if read_weights.size(0) > free_gates.size(0): + # read_weights = read_weights[:free_gates.size(0), :, :] + # if usage.size(0) > free_gates.size(0): + # usage = usage[:free_gates.size(0), :] + # if write_weights.size(0) > free_gates.size(0): + # write_weights = write_weights[:free_gates.size(0), :, :] + usage = usage + (1 - usage) * (1 - T.prod(1 - write_weights, 1)) + ψ = T.prod(1 - free_gates.unsqueeze(2) * read_weights, 1) + return usage * ψ + + def allocate(self, usage, write_gate): + # ensure values are not too small prior to cumprod. + usage = δ + (1 - δ) * usage + batch_size = usage.size(0) + # free list + sorted_usage, φ = T.topk(usage, self.mem_size, dim=1, largest=False) + + # cumprod with exclusive=True + # https://discuss.pytorch.org/t/cumprod-exclusive-true-equivalences/2614/8 + v = var(sorted_usage.data.new(batch_size, 1).fill_(1)) + cat_sorted_usage = T.cat((v, sorted_usage), 1) + prod_sorted_usage = T.cumprod(cat_sorted_usage, 1)[:, :-1] + + sorted_allocation_weights = (1 - sorted_usage) * prod_sorted_usage.squeeze() + + # construct the reverse sorting index https://stackoverflow.com/questions/2483696/undo-or-reverse-argsort-python + _, φ_rev = T.topk(φ, k=self.mem_size, dim=1, largest=False) + allocation_weights = sorted_allocation_weights.gather(1, φ_rev.long()) + + return allocation_weights.unsqueeze(1), usage + + def write_weighting(self, memory, write_content_weights, allocation_weights, write_gate, allocation_gate): + ag = allocation_gate.unsqueeze(-1) + wg = write_gate.unsqueeze(-1) + + return wg * (ag * allocation_weights + (1 - ag) * write_content_weights) + + def get_link_matrix(self, link_matrix, write_weights, precedence): + precedence = precedence.unsqueeze(2) + write_weights_i = write_weights.unsqueeze(3) + write_weights_j = write_weights.unsqueeze(2) + + prev_scale = 1 - write_weights_i - write_weights_j + new_link_matrix = write_weights_i * precedence + + link_matrix = prev_scale * link_matrix + new_link_matrix + # trick to delete diag elems + return self.I.expand_as(link_matrix) * link_matrix + + def update_precedence(self, precedence, write_weights): + return (1 - T.sum(write_weights, 2, keepdim=True)) * precedence + write_weights + + def write(self, write_key, write_vector, erase_vector, free_gates, read_strengths, write_strength, write_gate, allocation_gate, hidden): + # get 
current usage + hidden['usage_vector'] = self.get_usage_vector( + hidden['usage_vector'], + free_gates, + hidden['read_weights'], + hidden['write_weights'] + ) + + # lookup memory with write_key and write_strength + write_content_weights = self.content_weightings( + hidden['memory'], write_key, write_strength) + + # get memory allocation + alloc, _ = self.allocate( + hidden['usage_vector'], + allocation_gate * write_gate + ) + + # get write weightings + hidden['write_weights'] = self.write_weighting( + hidden['memory'], + write_content_weights, + alloc, + write_gate, + allocation_gate + ) + + weighted_resets = hidden['write_weights'].unsqueeze( + 3) * erase_vector.unsqueeze(2) + reset_gate = T.prod(1 - weighted_resets, 1) + # Update memory + hidden['memory'] = hidden['memory'] * reset_gate + + hidden['memory'] = hidden['memory'] + \ + T.bmm(hidden['write_weights'].transpose(1, 2), write_vector) + + # update link_matrix + hidden['link_matrix'] = self.get_link_matrix( + hidden['link_matrix'], + hidden['write_weights'], + hidden['precedence'] + ) + hidden['precedence'] = self.update_precedence( + hidden['precedence'], hidden['write_weights']) + + return hidden + + def content_weightings(self, memory, keys, strengths): + # if memory.size(0) > keys.size(0): + # memory = memory[:keys.size(0), :, :] + d = θ(memory, keys) + return σ(d * strengths.unsqueeze(2), 2) + + def directional_weightings(self, link_matrix, read_weights): + rw = read_weights.unsqueeze(1) + + f = T.matmul(link_matrix, rw.transpose(2, 3)).transpose(2, 3) + b = T.matmul(rw, link_matrix) + return f.transpose(1, 2), b.transpose(1, 2) + + def read_weightings(self, memory, content_weights, link_matrix, read_modes, read_weights): + forward_weight, backward_weight = self.directional_weightings( + link_matrix, read_weights) + + content_mode = read_modes[:, :, 2].contiguous( + ).unsqueeze(2) * content_weights + backward_mode = T.sum( + read_modes[:, :, 0:1].contiguous().unsqueeze(3) * backward_weight, 2) + forward_mode = T.sum( + read_modes[:, :, 1:2].contiguous().unsqueeze(3) * forward_weight, 2) + + return backward_mode + content_mode + forward_mode + + def read_vectors(self, memory, read_weights): + return T.bmm(read_weights, memory) + + def read(self, read_keys, read_strengths, read_modes, hidden): + content_weights = self.content_weightings( + hidden['memory'], read_keys, read_strengths) + + hidden['read_weights'] = self.read_weightings( + hidden['memory'], + content_weights, + hidden['link_matrix'], + read_modes, + hidden['read_weights'] + ) + read_vectors = self.read_vectors( + hidden['memory'], hidden['read_weights']) + return read_vectors, hidden + + def forward(self, ξ, hidden): + + # ξ = ξ.detach() + m = self.mem_size + w = self.cell_size + r = self.read_heads + b = ξ.size()[0] + + if self.independent_linears: + # r read keys (b * r * w) + read_keys = self.read_keys_transform(ξ).view(b, r, w) + # r read strengths (b * r) + read_strengths = F.softplus( + self.read_strengths_transform(ξ).view(b, r)) + # write key (b * 1 * w) + write_key = self.write_key_transform(ξ).view(b, 1, w) + # write strength (b * 1) + write_strength = F.softplus( + self.write_strength_transform(ξ).view(b, 1)) + # erase vector (b * 1 * w) + erase_vector = T.sigmoid( + self.erase_vector_transform(ξ).view(b, 1, w)) + # write vector (b * 1 * w) + write_vector = self.write_vector_transform(ξ).view(b, 1, w) + # r free gates (b * r) + free_gates = T.sigmoid(self.free_gates_transform(ξ).view(b, r)) + # allocation gate (b * 1) + allocation_gate = T.sigmoid( + 
self.allocation_gate_transform(ξ).view(b, 1)) + # write gate (b * 1) + write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1)) + # read modes (b * r * 3) + read_modes = σ(self.read_modes_transform(ξ).view(b, r, 3), -1) + else: + ξ = self.interface_weights(ξ) + # r read keys (b * w * r) + read_keys = ξ[:, :r * w].contiguous().view(b, r, w) + # r read strengths (b * r) + read_strengths = F.softplus( + ξ[:, r * w:r * w + r].contiguous().view(b, r)) + # write key (b * w * 1) + write_key = ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w) + # write strength (b * 1) + write_strength = F.softplus( + ξ[:, r * w + r + w].contiguous().view(b, 1)) + # erase vector (b * w) + erase_vector = T.sigmoid( + ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w)) + # write vector (b * w) + write_vector = ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w) + # r free gates (b * r) + free_gates = T.sigmoid( + ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r)) + # allocation gate (b * 1) + allocation_gate = T.sigmoid( + ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1)) + # write gate (b * 1) + write_gate = T.sigmoid( + ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1) + # read modes (b * 3*r) + read_modes = σ(ξ[:, r * w + 2 * r + 3 * w + 3: r * + w + 5 * r + 3 * w + 3].contiguous().view(b, r, 3), -1) + + hidden = self.write(write_key, write_vector, erase_vector, free_gates, + read_strengths, write_strength, write_gate, allocation_gate, hidden) + hidden["free_gates"] = free_gates.clone().detach() + hidden["allocation_gate"] = allocation_gate.clone().detach() + hidden["write_gate"] = write_gate.clone().detach() + hidden["read_modes"] = read_modes.clone().detach() + + return self.read(read_keys, read_strengths, read_modes, hidden) diff --git a/code/core/model/net.py b/code/core/model/net.py new file mode 100644 index 0000000..d8475ee --- /dev/null +++ b/code/core/model/net.py @@ -0,0 +1,501 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from core.model.net_utils import FC, MLP, LayerNorm +from core.model.mca import SA, MCA_ED, VLC +from core.model.dnc import DNC + +import torch.nn as nn +import torch.nn.functional as F +import torch + +# ------------------------------ +# ---- Flatten the sequence ---- +# ------------------------------ + +class AttFlat(nn.Module): + def __init__(self, __C): + super(AttFlat, self).__init__() + self.__C = __C + + self.mlp = MLP( + in_size=__C.HIDDEN_SIZE, + mid_size=__C.FLAT_MLP_SIZE, + out_size=__C.FLAT_GLIMPSES, + dropout_r=__C.DROPOUT_R, + use_relu=True + ) + + self.linear_merge = nn.Linear( + __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES, + __C.FLAT_OUT_SIZE + ) + + def forward(self, x, x_mask): + att = self.mlp(x) + att = att.masked_fill( + x_mask.squeeze(1).squeeze(1).unsqueeze(2), + -1e9 + ) + att = F.softmax(att, dim=1) + + att_list = [] + for i in range(self.__C.FLAT_GLIMPSES): + att_list.append( + torch.sum(att[:, :, i: i + 1] * x, dim=1) + ) + + x_atted = torch.cat(att_list, dim=1) + x_atted = self.linear_merge(x_atted) + + return x_atted + +class AttFlatMem(AttFlat): + def __init__(self, __C): + super(AttFlatMem, self).__init__(__C) + self.__C = __C + + def forward(self, x_mem, x, x_mask): + att = self.mlp(x_mem) + 
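+        # AttFlatMem differs from its parent AttFlat only in where the attention
+        # logits come from: they are computed from the memory-conditioned features
+        # x_mem, while the weighted sum below is still taken over the original
+        # features x. Padded positions are set to -inf before the softmax.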
att = att.masked_fill( + x_mask.squeeze(1).squeeze(1).unsqueeze(2), + float('-inf') + ) + att = F.softmax(att, dim=1) + att_list = [] + for i in range(self.__C.FLAT_GLIMPSES): + att_list.append( + torch.sum(att[:, :, i: i + 1] * x, dim=1) + ) + x_atted = torch.cat(att_list, dim=1) + x_atted = self.linear_merge(x_atted) + + return x_atted +# ------------------------- +# ---- Main MCAN Model ---- +# ------------------------- + +class Net1(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net1, self).__init__() + print('Training with Network type 1: VLCN') + self.pretrained_path = __C.PRETRAINED_PATH + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = VLC(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_frame = AttFlat(__C) + self.attflat_clip = AttFlat(__C) + + self.dnc = DNC( + __C.FLAT_OUT_SIZE, + __C.FLAT_OUT_SIZE, + rnn_type='lstm', + num_layers=2, + num_hidden_layers=2, + bias=True, + batch_first=True, + dropout=0, + bidirectional=True, + nr_cells=__C.CELL_COUNT_DNC, + read_heads=__C.N_READ_HEADS_DNC, + cell_size=__C.WORD_LENGTH_DNC, + nonlinearity='tanh', + gpu_id=0, + independent_linears=False, + share_memory=False, + debug=False, + clip=20, + ) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + + self.proj_norm_dnc = LayerNorm(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC) + self.linear_dnc = FC(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC, __C.FLAT_OUT_SIZE, dropout_r=0.2) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # Backbone Framework + lang_feat, frame_feat, clip_feat = self.backbone( + lang_feat, + frame_feat, + clip_feat, + lang_feat_mask, + frame_feat_mask, + clip_feat_mask + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + frame_feat = self.attflat_frame( + frame_feat, + frame_feat_mask + ) + + clip_feat = self.attflat_clip( + clip_feat, + clip_feat_mask + ) + proj_feat_0 = lang_feat + frame_feat + clip_feat + proj_feat_0 = self.proj_norm(proj_feat_0) + + proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1) + proj_feat_1, (_, _, rv), _ = self.dnc(proj_feat_1, (None, None, None), reset_experience=True, pass_through_memory=True) + proj_feat_1 = proj_feat_1.sum(1) + proj_feat_1 = torch.cat([proj_feat_1, rv], dim=-1) + proj_feat_1 = self.proj_norm_dnc(proj_feat_1) + proj_feat_1 = self.linear_dnc(proj_feat_1) + # proj_feat_1 = self.proj_norm(proj_feat_1) + + proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1)) + + return proj_feat + + def load_pretrained_weights(self): + pretrained_msvd = 
torch.load(self.pretrained_path)['state_dict'] + for n_pretrained, p_pretrained in pretrained_msvd.items(): + if 'dnc' in n_pretrained: + self.state_dict()[n_pretrained].copy_(p_pretrained) + print('Pre-trained dnc-weights successfully loaded!') + + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + +class Net2(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net2, self).__init__() + print('Training with Network type 2: VLCN-FLF') + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = VLC(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_frame = AttFlat(__C) + self.attflat_clip = AttFlat(__C) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # Backbone Framework + lang_feat, frame_feat, clip_feat = self.backbone( + lang_feat, + frame_feat, + clip_feat, + lang_feat_mask, + frame_feat_mask, + clip_feat_mask + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + frame_feat = self.attflat_frame( + frame_feat, + frame_feat_mask + ) + + clip_feat = self.attflat_clip( + clip_feat, + clip_feat_mask + ) + proj_feat = lang_feat + frame_feat + clip_feat + proj_feat = self.proj_norm(proj_feat) + proj_feat = torch.sigmoid(self.proj(proj_feat)) + + return proj_feat + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + +class Net3(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net3, self).__init__() + print('Training with Network type 3: VLCN+LSTM') + + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = VLC(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_frame = AttFlat(__C) + self.attflat_clip = AttFlat(__C) + + self.lstm_fusion = nn.LSTM( + input_size=__C.FLAT_OUT_SIZE, + hidden_size=__C.FLAT_OUT_SIZE, + num_layers=2, + batch_first=True, + bidirectional=True + ) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj_feat_1 = 
nn.Linear(__C.FLAT_OUT_SIZE * 2, __C.FLAT_OUT_SIZE) + + self.proj_norm_lstm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # Backbone Framework + lang_feat, frame_feat, clip_feat = self.backbone( + lang_feat, + frame_feat, + clip_feat, + lang_feat_mask, + frame_feat_mask, + clip_feat_mask + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + frame_feat = self.attflat_frame( + frame_feat, + frame_feat_mask + ) + + clip_feat = self.attflat_clip( + clip_feat, + clip_feat_mask + ) + proj_feat_0 = lang_feat + frame_feat + clip_feat + proj_feat_0 = self.proj_norm(proj_feat_0) + + proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1) + proj_feat_1, _ = self.lstm_fusion(proj_feat_1) + proj_feat_1 = proj_feat_1.sum(1) + proj_feat_1 = self.proj_feat_1(proj_feat_1) + proj_feat_1 = self.proj_norm_lstm(proj_feat_1) + + proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1)) + + return proj_feat + + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + +class Net4(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net4, self).__init__() + print('Training with Network type 4: MCAN') + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = MCA_ED(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_vid = AttFlat(__C) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # concat frame and clip features + vid_feat = torch.cat([frame_feat, clip_feat], dim=1) + vid_feat_mask = torch.cat([frame_feat_mask, clip_feat_mask], dim=-1) + # Backbone Framework + lang_feat, vid_feat = self.backbone( + lang_feat, + vid_feat, + lang_feat_mask, + vid_feat_mask, + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + vid_feat = self.attflat_vid( + vid_feat, + vid_feat_mask + ) + + proj_feat = lang_feat + vid_feat + proj_feat = self.proj_norm(proj_feat) + proj_feat = torch.sigmoid(self.proj(proj_feat)) + + return proj_feat + + # Masking + def 
make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + + diff --git a/code/core/model/net_utils.py b/code/core/model/net_utils.py new file mode 100644 index 0000000..822edd0 --- /dev/null +++ b/code/core/model/net_utils.py @@ -0,0 +1,62 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +import torch.nn as nn +import os +import torch + + +class FC(nn.Module): + def __init__(self, in_size, out_size, dropout_r=0., use_relu=True): + super(FC, self).__init__() + self.dropout_r = dropout_r + self.use_relu = use_relu + + self.linear = nn.Linear(in_size, out_size) + + if use_relu: + self.relu = nn.ReLU(inplace=True) + + if dropout_r > 0: + self.dropout = nn.Dropout(dropout_r) + + def forward(self, x): + x = self.linear(x) + + if self.use_relu: + x = self.relu(x) + + if self.dropout_r > 0: + x = self.dropout(x) + + return x + + +class MLP(nn.Module): + def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True): + super(MLP, self).__init__() + + self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu) + self.linear = nn.Linear(mid_size, out_size) + + def forward(self, x): + return self.linear(self.fc(x)) + + +class LayerNorm(nn.Module): + def __init__(self, size, eps=1e-6): + super(LayerNorm, self).__init__() + self.eps = eps + + self.a_2 = nn.Parameter(torch.ones(size)) + self.b_2 = nn.Parameter(torch.zeros(size)) + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + diff --git a/code/core/model/optim.py b/code/core/model/optim.py new file mode 100644 index 0000000..d01712c --- /dev/null +++ b/code/core/model/optim.py @@ -0,0 +1,98 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +import torch +import torch.optim as Optim + + +class WarmupOptimizer(object): + def __init__(self, lr_base, optimizer, data_size, batch_size): + self.optimizer = optimizer + self._step = 0 + self.lr_base = lr_base + self._rate = 0 + self.data_size = data_size + self.batch_size = batch_size + + def step(self): + self._step += 1 + + rate = self.rate() + for p in self.optimizer.param_groups: + p['lr'] = rate + self._rate = rate + + self.optimizer.step() + + + def zero_grad(self): + self.optimizer.zero_grad() + + + def rate(self, step=None): + if step is None: + step = self._step + + if step <= int(self.data_size / self.batch_size * 1): + r = self.lr_base * 1/4. + elif step <= int(self.data_size / self.batch_size * 2): + r = self.lr_base * 2/4. + elif step <= int(self.data_size / self.batch_size * 3): + r = self.lr_base * 3/4. 
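+        # Warmup schedule: the rate climbs in steps of lr_base/4, one step per
+        # epoch (an epoch being data_size / batch_size optimizer steps); e.g. with
+        # data_size=30000 and batch_size=64 the full lr_base is reached after
+        # roughly 1400 steps (numbers illustrative only). Past that point the
+        # base rate is used unchanged: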
+ else: + r = self.lr_base + + return r + + +def get_optim(__C, model, data_size, optimizer, lr_base=None): + if lr_base is None: + lr_base = __C.LR_BASE + + # modules = model._modules + # params_list = [] + # for m in modules: + # if 'dnc' in m: + # params_list.append({ + # 'params': filter(lambda p: p.requires_grad, modules[m].parameters()), + # 'lr': __C.LR_DNC_BASE, + # 'flag': True + # }) + # else: + # params_list.append({ + # 'params': filter(lambda p: p.requires_grad, modules[m].parameters()), + + # }) + if optimizer == 'adam': + optim = Optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=0, + betas=__C.OPT_BETAS, + eps=__C.OPT_EPS, + + ) + elif optimizer == 'rmsprop': + optim = Optim.RMSprop( + filter(lambda p: p.requires_grad, model.parameters()), + lr=0, + eps=__C.OPT_EPS, + weight_decay=__C.OPT_WEIGHT_DECAY + ) + else: + raise ValueError('{} optimizer is not supported'.fromat(optimizer)) + return WarmupOptimizer( + lr_base, + optim, + data_size, + __C.BATCH_SIZE + ) + + +def adjust_lr(optim, decay_r): + optim.lr_base *= decay_r + +def adjust_lr_dnc(optim, decay_r): + optim.lr_dnc_base *= decay_r diff --git a/code/core/model/utils.py b/code/core/model/utils.py new file mode 100644 index 0000000..8f57508 --- /dev/null +++ b/code/core/model/utils.py @@ -0,0 +1,163 @@ +""" +PyTorch DNC implementation from +--> +https://github.com/ixaxaar/pytorch-dnc +<-- +""" + +import torch.nn as nn +import torch as T +import torch.nn.functional as F +import numpy as np +import torch +from torch.autograd import Variable +import re +import string + + +def recursiveTrace(obj): + print(type(obj)) + if hasattr(obj, 'grad_fn'): + print(obj.grad_fn) + recursiveTrace(obj.grad_fn) + elif hasattr(obj, 'saved_variables'): + print(obj.requires_grad, len(obj.saved_tensors), len(obj.saved_variables)) + [print(v) for v in obj.saved_variables] + [recursiveTrace(v.grad_fn) for v in obj.saved_variables] + + +def cuda(x, grad=False, gpu_id=-1): + x = x.float() if T.is_tensor(x) else x + if gpu_id == -1: + t = T.FloatTensor(x) + t.requires_grad=grad + return t + else: + t = T.FloatTensor(x.pin_memory()).cuda(gpu_id) + t.requires_grad=grad + return t + + +def cudavec(x, grad=False, gpu_id=-1): + if gpu_id == -1: + t = T.Tensor(T.from_numpy(x)) + t.requires_grad = grad + return t + else: + t = T.Tensor(T.from_numpy(x).pin_memory()).cuda(gpu_id) + t.requires_grad = grad + return t + + +def cudalong(x, grad=False, gpu_id=-1): + if gpu_id == -1: + t = T.LongTensor(T.from_numpy(x.astype(np.long))) + t.requires_grad = grad + return t + else: + t = T.LongTensor(T.from_numpy(x.astype(np.long)).pin_memory()).cuda(gpu_id) + t.requires_grad = grad + return t + + +def θ(a, b, normBy=2): + """Batchwise Cosine similarity + Cosine similarity + Arguments: + a {Tensor} -- A 3D Tensor (b * m * w) + b {Tensor} -- A 3D Tensor (b * r * w) + Returns: + Tensor -- Batchwise cosine similarity (b * r * m) + """ + dot = T.bmm(a, b.transpose(1,2)) + a_norm = T.norm(a, normBy, dim=2).unsqueeze(2) + b_norm = T.norm(b, normBy, dim=2).unsqueeze(1) + cos = dot / (a_norm * b_norm + δ) + return cos.transpose(1,2).contiguous() + + +def σ(input, axis=1): + """Softmax on an axis + Softmax on an axis + Arguments: + input {Tensor} -- input Tensor + Keyword Arguments: + axis {number} -- axis on which to take softmax on (default: {1}) + Returns: + Tensor -- Softmax output Tensor + """ + input_size = input.size() + + trans_input = input.transpose(axis, len(input_size) - 1) + trans_size = trans_input.size() + + input_2d = 
trans_input.contiguous().view(-1, trans_size[-1]) + soft_max_2d = F.softmax(input_2d, -1) + soft_max_nd = soft_max_2d.view(*trans_size) + return soft_max_nd.transpose(axis, len(input_size) - 1) + +δ = 1e-6 + + +def register_nan_checks(model): + def check_grad(module, grad_input, grad_output): + # print(module) you can add this to see that the hook is called + # print('hook called for ' + str(type(module))) + if any(np.all(np.isnan(gi.data.cpu().numpy())) for gi in grad_input if gi is not None): + print('NaN gradient in grad_input ' + type(module).__name__) + + model.apply(lambda module: module.register_backward_hook(check_grad)) + + +def apply_dict(dic): + for k, v in dic.items(): + apply_var(v, k) + if isinstance(v, nn.Module): + key_list = [a for a in dir(v) if not a.startswith('__')] + for key in key_list: + apply_var(getattr(v, key), key) + for pk, pv in v._parameters.items(): + apply_var(pv, pk) + + +def apply_var(v, k): + if isinstance(v, Variable) and v.requires_grad: + v.register_hook(check_nan_gradient(k)) + + +def check_nan_gradient(name=''): + def f(tensor): + if np.isnan(T.mean(tensor).data.cpu().numpy()): + print('\nnan gradient of {} :'.format(name)) + # print(tensor) + # assert 0, 'nan gradient' + return tensor + return f + +def ptr(tensor): + if T.is_tensor(tensor): + return tensor.storage().data_ptr() + elif hasattr(tensor, 'data'): + return tensor.clone().data.storage().data_ptr() + else: + return tensor + +# TODO: EWW change this shit +def ensure_gpu(tensor, gpu_id): + if "cuda" in str(type(tensor)) and gpu_id != -1: + return tensor.cuda(gpu_id) + elif "cuda" in str(type(tensor)): + return tensor.cpu() + elif "Tensor" in str(type(tensor)) and gpu_id != -1: + return tensor.cuda(gpu_id) + elif "Tensor" in str(type(tensor)): + return tensor + elif type(tensor) is np.ndarray: + return cudavec(tensor, gpu_id=gpu_id).data + else: + return tensor + + +def print_gradient(x, name): + s = "Gradient of " + name + " ----------------------------------" + x.register_hook(lambda y: print(s, y.squeeze())) diff --git a/code/requirements.txt b/code/requirements.txt new file mode 100644 index 0000000..78cbad9 --- /dev/null +++ b/code/requirements.txt @@ -0,0 +1,48 @@ +absl-py==0.12.0 +blis==0.7.4 +cachetools==4.2.1 +catalogue==1.0.0 +certifi==2020.12.5 +chardet==4.0.0 +click==7.1.2 +cycler==0.10.0 +cymem==2.0.5 +google-auth==1.28.0 +google-auth-oauthlib==0.4.3 +grpcio==1.36.1 +idna==2.10 +importlib-metadata==3.7.3 +joblib==1.0.1 +Markdown==3.3.4 +mkl-fft==1.3.0 +mkl-random==1.1.1 +mkl-service==2.3.0 +murmurhash==1.0.5 +nltk==3.6.2 +oauthlib==3.1.0 +olefile==0.46 +plac==1.1.3 +positional-encodings==3.0.0 +preshed==3.0.5 +protobuf==3.15.6 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +PyYAML==5.4.1 +regex==2021.4.4 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7.2 +scikit-video==1.1.11 +scipy==1.5.4 +spacy==2.3.5 +srsly==1.0.5 +tensorboard==2.4.1 +tensorboard-plugin-wit==1.8.0 +tensorboardX==2.1 +thinc==7.4.5 +tqdm==4.59.0 +typing-extensions==3.7.4.3 +urllib3==1.26.4 +wasabi==0.8.2 +Werkzeug==1.0.1 +zipp==3.4.1 diff --git a/code/run.py b/code/run.py new file mode 100644 index 0000000..606427f --- /dev/null +++ b/code/run.py @@ -0,0 +1,198 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from cfgs.base_cfgs import Cfgs +from core.exec import Execution 
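+# run.py flow: parse the CLI flags defined below, merge them with the model YAML
+# config (cfgs/{MODEL}_model.yml) and the path/base configs, then hand the final
+# Cfgs object to core.exec.Execution, which runs the requested RUN_MODE.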
+import argparse, yaml, os + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +def parse_args(): + ''' + Parse input arguments + ''' + parser = argparse.ArgumentParser(description='VLCN Args') + + parser.add_argument('--RUN', dest='RUN_MODE', + default='train', + choices=['train', 'val', 'test'], + help='{train, val, test}', + type=str) # , required=True) + + parser.add_argument('--MODEL', dest='MODEL', + choices=['small', 'large'], + help='{small, large}', + default='small', type=str) + + parser.add_argument('--OPTIM', dest='OPTIM', + choices=['adam', 'rmsprop'], + help='The optimizer', + default='rmsprop', type=str) + + parser.add_argument('--SPLIT', dest='TRAIN_SPLIT', + choices=['train', 'train+val'], + help="set training split, " + "eg.'train', 'train+val'" + "set 'train' can trigger the " + "eval after every epoch", + default='train', + type=str) + + parser.add_argument('--EVAL_EE', dest='EVAL_EVERY_EPOCH', + default=True, + help='set True to evaluate the ' + 'val split when an epoch finished' + "(only work when train with " + "'train' split)", + type=bool) + + parser.add_argument('--SAVE_PRED', dest='TEST_SAVE_PRED', + help='set True to save the ' + 'prediction vectors' + '(only work in testing)', + default=False, + type=bool) + + parser.add_argument('--BS', dest='BATCH_SIZE', + help='batch size during training', + default=64, + type=int) + + parser.add_argument('--MAX_EPOCH', dest='MAX_EPOCH', + default=30, + help='max training epoch', + type=int) + + parser.add_argument('--PRELOAD', dest='PRELOAD', + help='pre-load the features into memory' + 'to increase the I/O speed', + default=False, + type=bool) + + parser.add_argument('--GPU', dest='GPU', + help="gpu select, eg.'0, 1, 2'", + default='0', + type=str) + + parser.add_argument('--SEED', dest='SEED', + help='fix random seed', + default=42, + type=int) + + parser.add_argument('--VERSION', dest='VERSION', + help='version control', + default='1.0.0', + type=str) + + parser.add_argument('--RESUME', dest='RESUME', + default=False, + help='resume training', + type=str2bool) + + parser.add_argument('--CKPT_V', dest='CKPT_VERSION', + help='checkpoint version', + type=str) + + parser.add_argument('--CKPT_E', dest='CKPT_EPOCH', + help='checkpoint epoch', + type=int) + + parser.add_argument('--CKPT_PATH', dest='CKPT_PATH', + help='load checkpoint path, we ' + 'recommend that you use ' + 'CKPT_VERSION and CKPT_EPOCH ' + 'instead', + type=str) + + parser.add_argument('--ACCU', dest='GRAD_ACCU_STEPS', + help='reduce gpu memory usage', + type=int) + + parser.add_argument('--NW', dest='NUM_WORKERS', + help='multithreaded loading', + default=0, + type=int) + + parser.add_argument('--PINM', dest='PIN_MEM', + help='use pin memory', + type=bool) + + parser.add_argument('--VERB', dest='VERBOSE', + help='verbose print', + type=bool) + + parser.add_argument('--DATA_PATH', dest='DATASET_PATH', + default='/projects/abdessaied/data/MSRVTT-QA/', + help='Dataset root path', + type=str) + + parser.add_argument('--EXP_NAME', dest='EXP_NAME', + help='The name of the experiment', + default="test", + type=str) + + parser.add_argument('--DEBUG', dest='DEBUG', + help='Triggeres debug mode: small fractions of the data are loaded ', + default='0', + type=str2bool) + + parser.add_argument('--ENABLE_TIME_MONITORING', dest='ENABLE_TIME_MONITORING', + 
help='Triggeres time monitoring when training', + default='0', + type=str2bool) + + parser.add_argument('--MODEL_TYPE', dest='MODEL_TYPE', + help='The model type to be used\n 1: VLCN \n 2:VLCN-FLF \n 3: VLCN+LSTM \n 4: MCAN', + default=1, + type=int) + + parser.add_argument('--PRETRAINED_PATH', dest='PRETRAINED_PATH', + help='Pretrained weights on msvd', + default='-', + type=str) + + parser.add_argument('--TEST_EPOCH', dest='TEST_EPOCH', + help='', + default=7, + type=int) + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + os.chdir(os.path.dirname(os.path.abspath(__file__))) + __C = Cfgs(args.EXP_NAME, args.DATASET_PATH) + args_dict = __C.parse_to_dict(args) + + cfg_file = "cfgs/{}_model.yml".format(args.MODEL) + with open(cfg_file, 'r') as f: + yaml_dict = yaml.load(f) + + args_dict = {**yaml_dict, **args_dict} + + __C.add_args(args_dict) + __C.proc() + + print('Hyper Parameters:') + print(__C) + + __C.check_path() + os.environ['CUDA_VISIBLE_DEVICES'] = __C.GPU + + execution = Execution(__C) + execution.run(__C.RUN_MODE) + + #execution.run('test', epoch=__C.TEST_EPOCH) diff --git a/core/.gitkeep b/core/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/core/data/.gitkeep b/core/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/core/data/dataset.py b/core/data/dataset.py new file mode 100644 index 0000000..217c5ea --- /dev/null +++ b/core/data/dataset.py @@ -0,0 +1,103 @@ +import glob, os, json, pickle +import numpy as np +from collections import defaultdict + +import torch +from torch.utils.data import Dataset +import torchvision.transforms as transforms + +from core.data.utils import tokenize, ans_stat, proc_ques, qlen_to_key, ans_to_key + + +class VideoQA_Dataset(Dataset): + def __init__(self, __C): + super(VideoQA_Dataset, self).__init__() + self.__C = __C + self.ans_size = __C.NUM_ANS + # load raw data + with open(__C.QA_PATH[__C.RUN_MODE], 'r') as f: + self.raw_data = json.load(f) + self.data_size = len(self.raw_data) + + splits = __C.SPLIT[__C.RUN_MODE].split('+') + + frames_list = glob.glob(__C.FRAMES + '*.pt') + clips_list = glob.glob(__C.CLIPS + '*.pt') + if 'msvd' in self.C.DATASET_PATH.lower(): + vid_ids = [int(s.split('/')[-1].split('.')[0][3:]) for s in frames_list] + else: + vid_ids = [int(s.split('/')[-1].split('.')[0][5:]) for s in frames_list] + self.frames_dict = {k: v for (k,v) in zip(vid_ids, frames_list)} + self.clips_dict = {k: v for (k,v) in zip(vid_ids, clips_list)} + del frames_list, clips_list + + q_list = [] + a_list = [] + a_dict = defaultdict(lambda: 0) + for split in ['train', 'val']: + with open(__C.QA_PATH[split], 'r') as f: + qa_data = json.load(f) + for d in qa_data: + q_list.append(d['question']) + a_list = d['answer'] + if d['answer'] not in a_dict: + a_dict[d['answer']] = 1 + else: + a_dict[d['answer']] += 1 + + top_answers = sorted(a_dict, key=a_dict.get, reverse=True) + self.qlen_bins_to_idx = { + '1-3': 0, + '4-8': 1, + '9-15': 2, + } + self.ans_rare_to_idx = { + '0-99': 0, + '100-299': 1, + '300-999': 2, + + } + self.qtypes_to_idx = { + 'what': 0, + 'who': 1, + 'how': 2, + 'when': 3, + 'where': 4, + } + + if __C.RUN_MODE == 'train': + self.ans_list = top_answers[:self.ans_size] + + self.ans_to_ix, self.ix_to_ans = ans_stat(self.ans_list) + + self.token_to_ix, self.pretrained_emb = tokenize(q_list, __C.USE_GLOVE) + self.token_size = self.token_to_ix.__len__() + print('== Question token vocab size:', self.token_size) + + self.idx_to_qtypes = {v: k for (k, v) in 
self.qtypes_to_idx.items()} + self.idx_to_qlen_bins = {v: k for (k, v) in self.qlen_bins_to_idx.items()} + self.idx_to_ans_rare = {v: k for (k, v) in self.ans_rare_to_idx.items()} + + def __getitem__(self, idx): + sample = self.raw_data[idx] + ques = sample['question'] + q_type = self.qtypes_to_idx[ques.split(' ')[0]] + ques_idx, qlen, _ = proc_ques(ques, self.token_to_ix, self.__C.MAX_TOKEN) + qlen_bin = self.qlen_bins_to_idx[qlen_to_key(qlen)] + + answer = sample['answer'] + answer = self.ans_to_ix.get(answer, np.random.randint(0, high=len(self.ans_list))) + ans_rarity = self.ans_rare_to_idx[ans_to_key(answer)] + + answer_one_hot = torch.zeros(self.ans_size) + answer_one_hot[answer] = 1.0 + + vid_id = sample['video_id'] + frames = torch.load(open(self.frames_dict[vid_id], 'rb')).cpu() + clips = torch.load(open(self.clips_dict[vid_id], 'rb')).cpu() + + return torch.from_numpy(ques_idx).long(), frames, clips, answer_one_hot, torch.tensor(answer).long(), \ + torch.tensor(q_type).long(), torch.tensor(qlen_bin).long(), torch.tensor(ans_rarity).long() + + def __len__(self): + return self.data_size diff --git a/core/data/preprocess.py b/core/data/preprocess.py new file mode 100644 index 0000000..5ac9616 --- /dev/null +++ b/core/data/preprocess.py @@ -0,0 +1,182 @@ +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import skvideo.io as skv +import torch +import pickle +from PIL import Image +import tqdm +import numpy as np +from model.C3D import C3D +import json +from torchvision.models import vgg19 +import torchvision.transforms as transforms +import torch.nn as nn +import argparse + + +def _select_frames(path, frame_num): + """Select representative frames for video. + Ignore some frames both at begin and end of video. + Args: + path: Path of video. + Returns: + frames: list of frames. + """ + frames = list() + video_data = skv.vread(path) + total_frames = video_data.shape[0] + # Ignore some frame at begin and end. + for i in np.linspace(0, total_frames, frame_num + 2)[1:frame_num + 1]: + frame_data = video_data[int(i)] + img = Image.fromarray(frame_data) + img = img.resize((224, 224), Image.BILINEAR) + frame_data = np.array(img) + frames.append(frame_data) + return frames + +def _select_clips(path, clip_num): + """Select self.batch_size clips for video. Each clip has 16 frames. + Args: + path: Path of video. + Returns: + clips: list of clips. 
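+        Each clip consists of 16 consecutive frames centred on one of clip_num
+        uniformly spaced positions, resized to 112x112 for the C3D extractor.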
+ """ + clips = list() + # video_info = skvideo.io.ffprobe(path) + video_data = skv.vread(path) + total_frames = video_data.shape[0] + height = video_data[1] + width = video_data.shape[2] + for i in np.linspace(0, total_frames, clip_num + 2)[1:clip_num + 1]: + # Select center frame first, then include surrounding frames + clip_start = int(i) - 8 + clip_end = int(i) + 8 + if clip_start < 0: + clip_end = clip_end - clip_start + clip_start = 0 + if clip_end > total_frames: + clip_start = clip_start - (clip_end - total_frames) + clip_end = total_frames + clip = video_data[clip_start:clip_end] + new_clip = [] + for j in range(16): + frame_data = clip[j] + img = Image.fromarray(frame_data) + img = img.resize((112, 112), Image.BILINEAR) + frame_data = np.array(img) * 1.0 + # frame_data -= self.mean[j] + new_clip.append(frame_data) + clips.append(new_clip) + return clips + +def preprocess_videos(video_dir, frame_num, clip_num): + frames_dir = os.path.join(os.path.dirname(video_dir), 'frames') + os.mkdir(frames_dir) + + clips_dir = os.path.join(os.path.dirname(video_dir), 'clips') + os.mkdir(clips_dir) + + for video_name in tqdm.tqdm(os.listdir(video_dir)): + video_path = os.path.join(video_dir, video_name) + frames = _select_frames(video_path, frame_num) + clips = _select_clips(video_path, clip_num) + + with open(os.path.join(frames_dir, video_name.split('.')[0] + '.pkl'), "wb") as f: + pickle.dump(frames, f, protocol=pickle.HIGHEST_PROTOCOL) + + with open(os.path.join(clips_dir, video_name.split('.')[0] + '.pkl'), "wb") as f: + pickle.dump(clips, f, protocol=pickle.HIGHEST_PROTOCOL) + + +def generate_video_features(path_frames, path_clips, c3d_path): + device = torch.device('cuda:0') + frame_feat_dir = os.path.join(os.path.dirname(path_frames), 'frame_feat') + os.makedirs(frame_feat_dir, exist_ok=True) + + clip_feat_dir = os.path.join(os.path.dirname(path_frames), 'clip_feat') + os.makedirs(clip_feat_dir, exist_ok=True) + + cnn = vgg19(pretrained=True) + in_features = cnn.classifier[-1].in_features + cnn.classifier = nn.Sequential( + *list(cnn.classifier.children())[:-1]) # remove last fc layer + cnn.to(device).eval() + c3d = C3D() + c3d.load_state_dict(torch.load(c3d_path)) + c3d.to(device).eval() + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), + (0.229, 0.224, 0.225))]) + for vid_name in tqdm.tqdm(os.listdir(path_frames)): + frame_path = os.path.join(path_frames, vid_name) + clip_path = os.path.join(path_clips, vid_name) + + frames = pickle.load(open(frame_path, 'rb')) + clips = pickle.load(open(clip_path, 'rb')) + + frames = [transform(f) for f in frames] + frame_feat = [] + clip_feat = [] + + for frame in frames: + with torch.no_grad(): + feat = cnn(frame.unsqueeze(0).to(device)) + frame_feat.append(feat) + for clip in clips: + # clip has shape (c x f x h x w) + clip = torch.from_numpy(np.float32(np.array(clip))) + clip = clip.transpose(3, 0) + clip = clip.transpose(3, 1) + clip = clip.transpose(3, 2).unsqueeze(0).to(device) + with torch.no_grad(): + feat = c3d(clip) + clip_feat.append(feat) + frame_feat = torch.cat(frame_feat, dim=0) + clip_feat = torch.cat(clip_feat, dim=0) + + torch.save(frame_feat, os.path.join(frame_feat_dir, vid_name.split('.')[0] + '.pt')) + torch.save(clip_feat, os.path.join(clip_feat_dir, vid_name.split('.')[0] + '.pt')) + +def parse_args(): + ''' + Parse input arguments + ''' + parser = argparse.ArgumentParser(description='Preprocessing Args') + + parser.add_argument('--RAW_VID_PATH', dest='RAW_VID_PATH', + 
help='The path to the raw videos', + required=True, + type=str) + + parser.add_argument('--FRAMES_OUTPUT_DIR', dest='FRAMES_OUTPUT_DIR', + help='The directory where the processed frames and their features will be stored', + required=True, + type=str) + + parser.add_argument('--CLIPS_OUTPUT_DIR', dest='FRAMES_OUTPUT_DIR', + help='The directory where the processed frames and their features will be stored', + required=True, + type=str) + + parser.add_argument('--C3D_PATH', dest='C3D_PATH', + help='Pretrained C3D path', + required=True, + type=str) + + parser.add_argument('--NUM_SAMPLES', dest='NUM_SAMPLES', + help='The number of frames/clips to be sampled from the video', + default=20, + type=int) + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + preprocess_videos(args.RAW_VID_PATH, args.NUM_SAMPLES, args.NUM_SAMPLES) + frames_dir = os.path.join(os.path.dirname(args.RAW_VID_PATH), 'frames') + clips_dir = os.path.join(os.path.dirname(args.RAW_VID_PATH), 'clips') + generate_video_features(frames_dir, clips_dir) diff --git a/core/data/utils.py b/core/data/utils.py new file mode 100644 index 0000000..44696c5 --- /dev/null +++ b/core/data/utils.py @@ -0,0 +1,81 @@ +import en_vectors_web_lg, random, re, json +import numpy as np + +def tokenize(ques_list, use_glove): + token_to_ix = { + 'PAD': 0, + 'UNK': 1, + } + + spacy_tool = None + pretrained_emb = [] + if use_glove: + spacy_tool = en_vectors_web_lg.load() + pretrained_emb.append(spacy_tool('PAD').vector) + pretrained_emb.append(spacy_tool('UNK').vector) + + for ques in ques_list: + words = re.sub( + r"([.,'!?\"()*#:;])", + '', + ques.lower() + ).replace('-', ' ').replace('/', ' ').split() + + for word in words: + if word not in token_to_ix: + token_to_ix[word] = len(token_to_ix) + if use_glove: + pretrained_emb.append(spacy_tool(word).vector) + + pretrained_emb = np.array(pretrained_emb) + + return token_to_ix, pretrained_emb + + +def proc_ques(ques, token_to_ix, max_token): + ques_ix = np.zeros(max_token, np.int64) + + words = re.sub( + r"([.,'!?\"()*#:;])", + '', + ques.lower() + ).replace('-', ' ').replace('/', ' ').split() + q_len = 0 + for ix, word in enumerate(words): + if word in token_to_ix: + ques_ix[ix] = token_to_ix[word] + q_len += 1 + else: + ques_ix[ix] = token_to_ix['UNK'] + + if ix + 1 == max_token: + break + + return ques_ix, q_len, len(words) + +def ans_stat(ans_list): + ans_to_ix, ix_to_ans = {}, {} + for i, ans in enumerate(ans_list): + ans_to_ix[ans] = i + ix_to_ans[i] = ans + + return ans_to_ix, ix_to_ans + +def shuffle_list(ans_list): + random.shuffle(ans_list) + +def qlen_to_key(q_len): + if 1<= q_len <=3: + return '1-3' + if 4<= q_len <=8: + return '4-8' + if 9<= q_len: + return '9-15' + +def ans_to_key(ans_idx): + if 0 <= ans_idx <= 99 : + return '0-99' + if 100 <= ans_idx <= 299 : + return '100-299' + if 300 <= ans_idx <= 999 : + return '300-999' diff --git a/core/exec.py b/core/exec.py new file mode 100644 index 0000000..0d7cab3 --- /dev/null +++ b/core/exec.py @@ -0,0 +1,523 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from core.data.dataset import VideoQA_Dataset +from core.model.net import Net1, Net2, Net3, Net4 +from core.model.optim import get_optim, adjust_lr +from core.metrics import get_acc +from tqdm import tqdm 
+from core.data.utils import shuffle_list + +import os, json, torch, datetime, pickle, copy, shutil, time, math +import numpy as np +import torch.nn as nn +import torch.utils.data as Data +from tensorboardX import SummaryWriter +from torch.autograd import Variable as var + +class Execution: + def __init__(self, __C): + self.__C = __C + print('Loading training set ........') + __C_train = copy.deepcopy(self.__C) + setattr(__C_train, 'RUN_MODE', 'train') + self.dataset = VideoQA_Dataset(__C_train) + + self.dataset_eval = None + if self.__C.EVAL_EVERY_EPOCH: + __C_eval = copy.deepcopy(self.__C) + setattr(__C_eval, 'RUN_MODE', 'val') + + print('Loading validation set for per-epoch evaluation ........') + self.dataset_eval = VideoQA_Dataset(__C_eval) + self.dataset_eval.ans_list = self.dataset.ans_list + self.dataset_eval.ans_to_ix, self.dataset_eval.ix_to_ans = self.dataset.ans_to_ix, self.dataset.ix_to_ans + self.dataset_eval.token_to_ix, self.dataset_eval.pretrained_emb = self.dataset.token_to_ix, self.dataset.pretrained_emb + + __C_test = copy.deepcopy(self.__C) + setattr(__C_test, 'RUN_MODE', 'test') + + self.dataset_test = VideoQA_Dataset(__C_test) + self.dataset_test.ans_list = self.dataset.ans_list + self.dataset_test.ans_to_ix, self.dataset_test.ix_to_ans = self.dataset.ans_to_ix, self.dataset.ix_to_ans + self.dataset_test.token_to_ix, self.dataset_test.pretrained_emb = self.dataset.token_to_ix, self.dataset.pretrained_emb + + self.writer = SummaryWriter(self.__C.TB_PATH) + + def train(self, dataset, dataset_eval=None): + # Obtain needed information + data_size = dataset.data_size + token_size = dataset.token_size + ans_size = dataset.ans_size + pretrained_emb = dataset.pretrained_emb + net = self.construct_net(self.__C.MODEL_TYPE) + if os.path.isfile(self.__C.PRETRAINED_PATH) and self.__C.MODEL_TYPE == 11: + print('Loading pretrained DNC-weigths') + net.load_pretrained_weights() + net.cuda() + net.train() + + # Define the multi-gpu training if needed + if self.__C.N_GPU > 1: + net = nn.DataParallel(net, device_ids=self.__C.DEVICES) + + # Define the binary cross entropy loss + # loss_fn = torch.nn.BCELoss(size_average=False).cuda() + loss_fn = torch.nn.BCELoss(reduction='sum').cuda() + # Load checkpoint if resume training + if self.__C.RESUME: + print(' ========== Resume training') + + if self.__C.CKPT_PATH is not None: + print('Warning: you are now using CKPT_PATH args, ' + 'CKPT_VERSION and CKPT_EPOCH will not work') + + path = self.__C.CKPT_PATH + else: + path = self.__C.CKPTS_PATH + \ + 'ckpt_' + self.__C.CKPT_VERSION + \ + '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl' + + # Load the network parameters + print('Loading ckpt {}'.format(path)) + ckpt = torch.load(path) + print('Finish!') + net.load_state_dict(ckpt['state_dict']) + + # Load the optimizer paramters + optim = get_optim(self.__C, net, data_size, ckpt['optim'], lr_base=ckpt['lr_base']) + optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH) + optim.optimizer.load_state_dict(ckpt['optimizer']) + + start_epoch = self.__C.CKPT_EPOCH + + else: + if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH): + shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION) + + os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION) + + optim = get_optim(self.__C, net, data_size, self.__C.OPTIM) + start_epoch = 0 + + loss_sum = 0 + named_params = list(net.named_parameters()) + grad_norm = np.zeros(len(named_params)) + + # Define multi-thread dataloader + if self.__C.SHUFFLE_MODE in ['external']: + 
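+            # Note: with SHUFFLE_MODE == 'external' the DataLoader keeps a fixed order;
+            # the answer list is instead re-shuffled once per epoch further below via
+            # shuffle_list(dataset.ans_list).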
dataloader = Data.DataLoader( + dataset, + batch_size=self.__C.BATCH_SIZE, + shuffle=False, + num_workers=self.__C.NUM_WORKERS, + pin_memory=self.__C.PIN_MEM, + drop_last=True + ) + else: + dataloader = Data.DataLoader( + dataset, + batch_size=self.__C.BATCH_SIZE, + shuffle=True, + num_workers=self.__C.NUM_WORKERS, + pin_memory=self.__C.PIN_MEM, + drop_last=True + ) + + # Training script + for epoch in range(start_epoch, self.__C.MAX_EPOCH): + + # Save log information + logfile = open( + self.__C.LOG_PATH + + 'log_run_' + self.__C.VERSION + '.txt', + 'a+' + ) + logfile.write( + 'nowTime: ' + + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + '\n' + ) + logfile.close() + + # Learning Rate Decay + if epoch in self.__C.LR_DECAY_LIST: + adjust_lr(optim, self.__C.LR_DECAY_R) + + # Externally shuffle + if self.__C.SHUFFLE_MODE == 'external': + shuffle_list(dataset.ans_list) + + time_start = time.time() + # Iteration + for step, ( + ques_ix_iter, + frames_feat_iter, + clips_feat_iter, + ans_iter, + _, + _, + _, + _ + ) in enumerate(dataloader): + + ques_ix_iter = ques_ix_iter.cuda() + frames_feat_iter = frames_feat_iter.cuda() + clips_feat_iter = clips_feat_iter.cuda() + ans_iter = ans_iter.cuda() + + optim.zero_grad() + + for accu_step in range(self.__C.GRAD_ACCU_STEPS): + + sub_frames_feat_iter = \ + frames_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + sub_clips_feat_iter = \ + clips_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + sub_ques_ix_iter = \ + ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + sub_ans_iter = \ + ans_iter[accu_step * self.__C.SUB_BATCH_SIZE: + (accu_step + 1) * self.__C.SUB_BATCH_SIZE] + + pred = net( + sub_frames_feat_iter, + sub_clips_feat_iter, + sub_ques_ix_iter + ) + + loss = loss_fn(pred, sub_ans_iter) + + # only mean-reduction needs be divided by grad_accu_steps + # removing this line wouldn't change our results because the speciality of Adam optimizer, + # but would be necessary if you use SGD optimizer. 
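+                    # With reduction='sum' and Adam, dividing the loss by GRAD_ACCU_STEPS only
+                    # rescales the gradients by a constant, which Adam's per-parameter step-size
+                    # normalisation largely absorbs; with plain SGD that constant would directly
+                    # scale the effective learning rate, hence the caveat above.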
+ # loss /= self.__C.GRAD_ACCU_STEPS + # start_backward = time.time() + loss.backward() + + if self.__C.VERBOSE: + if dataset_eval is not None: + mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val'] + else: + mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test'] + + # logging + + self.writer.add_scalar( + 'train/loss', + loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE, + global_step=step + epoch * math.ceil(data_size / self.__C.BATCH_SIZE)) + + self.writer.add_scalar( + 'train/lr', + optim._rate, + global_step=step + epoch * math.ceil(data_size / self.__C.BATCH_SIZE)) + + print("\r[exp_name %s][version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % ( + self.__C.EXP_NAME, + self.__C.VERSION, + epoch + 1, + step, + int(data_size / self.__C.BATCH_SIZE), + mode_str, + loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE, + optim._rate, + ), end=' ') + + # Gradient norm clipping + if self.__C.GRAD_NORM_CLIP > 0: + nn.utils.clip_grad_norm_( + net.parameters(), + self.__C.GRAD_NORM_CLIP + ) + + # Save the gradient information + for name in range(len(named_params)): + norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \ + if named_params[name][1].grad is not None else 0 + grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS + + optim.step() + + time_end = time.time() + print('Finished in {}s'.format(int(time_end-time_start))) + + epoch_finish = epoch + 1 + + # Save checkpoint + state = { + 'state_dict': net.state_dict(), + 'optimizer': optim.optimizer.state_dict(), + 'lr_base': optim.lr_base, + 'optim': optim.lr_base, } + + torch.save( + state, + self.__C.CKPTS_PATH + + 'ckpt_' + self.__C.VERSION + + '/epoch' + str(epoch_finish) + + '.pkl' + ) + + # Logging + logfile = open( + self.__C.LOG_PATH + + 'log_run_' + self.__C.VERSION + '.txt', + 'a+' + ) + logfile.write( + 'epoch = ' + str(epoch_finish) + + ' loss = ' + str(loss_sum / data_size) + + '\n' + + 'lr = ' + str(optim._rate) + + '\n\n' + ) + logfile.close() + + # Eval after every epoch + if dataset_eval is not None: + self.eval( + net, + dataset_eval, + self.writer, + epoch, + valid=True, + ) + + loss_sum = 0 + grad_norm = np.zeros(len(named_params)) + + + # Evaluation + def eval(self, net, dataset, writer, epoch, valid=False): + + ans_ix_list = [] + pred_list = [] + q_type_list = [] + q_bin_list = [] + ans_rarity_list = [] + + ans_qtype_dict = {'what': [], 'who': [], 'how': [], 'when': [], 'where': []} + pred_qtype_dict = {'what': [], 'who': [], 'how': [], 'when': [], 'where': []} + + + ans_qlen_bin_dict = {'1-3': [], '4-8': [], '9-15': []} + pred_qlen_bin_dict = {'1-3': [], '4-8': [], '9-15': []} + + ans_ans_rarity_dict = {'0-99': [], '100-299': [], '300-999': []} + pred_ans_rarity_dict = {'0-99': [], '100-299': [], '300-999': []} + + data_size = dataset.data_size + + net.eval() + + if self.__C.N_GPU > 1: + net = nn.DataParallel(net, device_ids=self.__C.DEVICES) + + dataloader = Data.DataLoader( + dataset, + batch_size=self.__C.EVAL_BATCH_SIZE, + shuffle=False, + num_workers=self.__C.NUM_WORKERS, + pin_memory=True + ) + + for step, ( + ques_ix_iter, + frames_feat_iter, + clips_feat_iter, + _, + ans_iter, + q_type, + qlen_bin, + ans_rarity + ) in enumerate(dataloader): + print("\rEvaluation: [step %4d/%4d]" % ( + step, + int(data_size / self.__C.EVAL_BATCH_SIZE), + ), end=' ') + ques_ix_iter = ques_ix_iter.cuda() + frames_feat_iter = frames_feat_iter.cuda() + clips_feat_iter = clips_feat_iter.cuda() + with torch.no_grad(): + + pred = net( + frames_feat_iter, + clips_feat_iter, + ques_ix_iter 
+ ) + + pred_np = pred.cpu().data.numpy() + pred_argmax = np.argmax(pred_np, axis=1) + pred_list.extend(pred_argmax) + ans_ix_list.extend(ans_iter.tolist()) + q_type_list.extend(q_type.tolist()) + q_bin_list.extend(qlen_bin.tolist()) + ans_rarity_list.extend(ans_rarity.tolist()) + + print('') + + assert len(pred_list) == len(ans_ix_list) == len(q_type_list) == len(q_bin_list) == len(ans_rarity_list) + pred_list = [dataset.ix_to_ans[pred] for pred in pred_list] + ans_ix_list = [dataset.ix_to_ans[ans] for ans in ans_ix_list] + + # Run validation script + scores_per_qtype = { + 'what': {}, + 'who': {}, + 'how': {}, + 'when': {}, + 'where': {}, + } + scores_per_qlen_bin = { + '1-3': {}, + '4-8': {}, + '9-15': {}, + } + scores_ans_rarity_dict = { + '0-99': {}, + '100-299': {}, + '300-999': {} + } + + if valid: + # create vqa object and vqaRes object + for pred, ans, q_type in zip(pred_list, ans_ix_list, q_type_list): + pred_qtype_dict[dataset.idx_to_qtypes[q_type]].append(pred) + ans_qtype_dict[dataset.idx_to_qtypes[q_type]].append(ans) + + print('----------------- Computing scores -----------------') + acc = get_acc(ans_ix_list, pred_list) + print('----------------- Overall -----------------') + print('acc: {}'.format(acc)) + writer.add_scalar('acc/overall', acc, global_step=epoch) + + for q_type in scores_per_qtype: + print('----------------- Computing "{}" q-type scores -----------------'.format(q_type)) + # acc, wups_0, wups_1 = get_scores( + # ans_ix_dict[q_type], pred_ix_dict[q_type]) + acc = get_acc(ans_qtype_dict[q_type], pred_qtype_dict[q_type]) + print('acc: {}'.format(acc)) + writer.add_scalar( + 'acc/{}'.format(q_type), acc, global_step=epoch) + else: + for pred, ans, q_type, qlen_bin, a_rarity in zip( + pred_list, ans_ix_list, q_type_list, q_bin_list, ans_rarity_list): + + pred_qtype_dict[dataset.idx_to_qtypes[q_type]].append(pred) + ans_qtype_dict[dataset.idx_to_qtypes[q_type]].append(ans) + + pred_qlen_bin_dict[dataset.idx_to_qlen_bins[qlen_bin]].append(pred) + ans_qlen_bin_dict[dataset.idx_to_qlen_bins[qlen_bin]].append(ans) + + pred_ans_rarity_dict[dataset.idx_to_ans_rare[a_rarity]].append(pred) + ans_ans_rarity_dict[dataset.idx_to_ans_rare[a_rarity]].append(ans) + + print('----------------- Computing overall scores -----------------') + acc = get_acc(ans_ix_list, pred_list) + + print('----------------- Overall -----------------') + print('acc:{}'.format(acc)) + + + print('----------------- Computing q-type scores -----------------') + for q_type in scores_per_qtype: + acc = get_acc(ans_qtype_dict[q_type], pred_qtype_dict[q_type]) + print(' {} '.format(q_type)) + print('acc:{}'.format(acc)) + + print('----------------- Computing qlen-bins scores -----------------') + for qlen_bin in scores_per_qlen_bin: + + acc = get_acc(ans_qlen_bin_dict[qlen_bin], pred_qlen_bin_dict[qlen_bin]) + print(' {} '.format(qlen_bin)) + print('acc:{}'.format(acc)) + + print('----------------- Computing ans-rarity scores -----------------') + for a_rarity in scores_ans_rarity_dict: + acc = get_acc(ans_ans_rarity_dict[a_rarity], pred_ans_rarity_dict[a_rarity]) + print(' {} '.format(a_rarity)) + print('acc:{}'.format(acc)) + net.train() + + def construct_net(self, model_type): + if model_type == 1: + net = Net1( + self.__C, + self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + elif model_type == 2: + net = Net2( + self.__C, + self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + elif model_type == 3: + net = Net3( + self.__C, + 
self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + elif model_type == 4: + net = Net4( + self.__C, + self.dataset.pretrained_emb, + self.dataset.token_size, + self.dataset.ans_size + ) + else: + raise ValueError('Net{} is not supported'.format(model_type)) + return net + + def run(self, run_mode, epoch=None): + self.set_seed(self.__C.SEED) + if run_mode == 'train': + self.empty_log(self.__C.VERSION) + self.train(self.dataset, self.dataset_eval) + + elif run_mode == 'val': + self.eval(self.dataset, valid=True) + + elif run_mode == 'test': + net = self.construct_net(self.__C.MODEL_TYPE) + assert epoch is not None + path = self.__C.CKPTS_PATH + \ + 'ckpt_' + self.__C.VERSION + \ + '/epoch' + str(epoch) + '.pkl' + print('Loading ckpt {}'.format(path)) + state_dict = torch.load(path)['state_dict'] + net.load_state_dict(state_dict) + net.cuda() + self.eval(net, self.dataset_test, self.writer, 0) + + else: + exit(-1) + + def set_seed(self, seed): + """Sets the seed for reproducibility. + Args: + seed (int): The seed used + """ + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + np.random.seed(seed) + print('\nSeed set to {}...\n'.format(seed)) + + def empty_log(self, version): + print('Initializing log file ........') + if (os.path.exists(self.__C.LOG_PATH + 'log_run_' + version + '.txt')): + os.remove(self.__C.LOG_PATH + 'log_run_' + version + '.txt') + print('Finished!') + print('') diff --git a/core/metrics.py b/core/metrics.py new file mode 100644 index 0000000..0a45f1f --- /dev/null +++ b/core/metrics.py @@ -0,0 +1,211 @@ +""" +Author: Mateusz Malinowski +Email: mmalinow@mpi-inf.mpg.de + +The script assumes there are two files +- first file with ground truth answers +- second file with predicted answers +both answers are line-aligned + +The script also assumes that answer items are comma separated. +For instance, chair,table,window + +It is also a set measure, so not exactly the same as accuracy +even if dirac measure is used since {book,book}=={book}, also {book,chair}={chair,book} + +Logs: + 05.09.2015 - white spaces surrounding words are stripped away so that {book, chair}={book,chair} +""" + +import sys + +#import enchant + +from numpy import prod +from nltk.corpus import wordnet as wn +from tqdm import tqdm + +def file2list(filepath): + with open(filepath,'r') as f: + lines =[k for k in + [k.strip() for k in f.readlines()] + if len(k) > 0] + + return lines + + +def list2file(filepath,mylist): + mylist='\n'.join(mylist) + with open(filepath,'w') as f: + f.writelines(mylist) + + +def items2list(x): + """ + x - string of comma-separated answer items + """ + return [l.strip() for l in x.split(',')] + + +def fuzzy_set_membership_measure(x,A,m): + """ + Set membership measure. 
+ x: element + A: set of elements + m: point-wise element-to-element measure m(a,b) ~ similarity(a,b) + + This function implments a fuzzy set membership measure: + m(x \in A) = max_{a \in A} m(x,a)} + """ + return 0 if A==[] else max(map(lambda a: m(x,a), A)) + + +def score_it(A,T,m): + """ + A: list of A items + T: list of T items + m: set membership measure + m(a \in A) gives a membership quality of a into A + + This function implements a fuzzy accuracy score: + score(A,T) = min{prod_{a \in A} m(a \in T), prod_{t \in T} m(a \in A)} + where A and T are set representations of the answers + and m is a measure + """ + if A==[] and T==[]: + return 1 + + # print A,T + + score_left=0 if A==[] else prod(list(map(lambda a: m(a,T), A))) + score_right=0 if T==[] else prod(list(map(lambda t: m(t,A),T))) + return min(score_left,score_right) + + +# implementations of different measure functions +def dirac_measure(a,b): + """ + Returns 1 iff a=b and 0 otherwise. + """ + if a==[] or b==[]: + return 0.0 + return float(a==b) + + +def wup_measure(a,b,similarity_threshold=0.925): + """ + Returns Wu-Palmer similarity score. + More specifically, it computes: + max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y) + where interp is a 'interpretation field' + """ + def get_semantic_field(a): + weight = 1.0 + semantic_field = wn.synsets(a,pos=wn.NOUN) + return (semantic_field,weight) + + + def get_stem_word(a): + """ + Sometimes answer has form word\d+:wordid. + If so we return word and downweight + """ + weight = 1.0 + return (a,weight) + + + global_weight=1.0 + + (a,global_weight_a)=get_stem_word(a) + (b,global_weight_b)=get_stem_word(b) + global_weight = min(global_weight_a,global_weight_b) + + if a==b: + # they are the same + return 1.0*global_weight + + if a==[] or b==[]: + return 0 + + + interp_a,weight_a = get_semantic_field(a) + interp_b,weight_b = get_semantic_field(b) + + if interp_a == [] or interp_b == []: + return 0 + + # we take the most optimistic interpretation + global_max=0.0 + for x in interp_a: + for y in interp_b: + local_score=x.wup_similarity(y) + if local_score > global_max: + global_max=local_score + + # we need to use the semantic fields and therefore we downweight + # unless the score is high which indicates both are synonyms + if global_max < similarity_threshold: + interp_weight = 0.1 + else: + interp_weight = 1.0 + + final_score=global_max*weight_a*weight_b*interp_weight*global_weight + return final_score +### + + +def get_scores(input_gt, input_pred, threshold_0=0.0, threshold_1=0.9): + element_membership_acc=dirac_measure + element_membership_wups_0=lambda x,y: wup_measure(x,y,threshold_0) + element_membership_wups_1=lambda x,y: wup_measure(x,y,threshold_1) + + set_membership_acc=\ + lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_acc) + set_membership_wups_0=\ + lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_wups_0) + set_membership_wups_1=\ + lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_wups_1) + + score_list_acc = [] + score_list_wups_0 = [] + score_list_wups_1 = [] + pbar = tqdm(zip(input_gt,input_pred)) + pbar.set_description('Computing Acc') + + for (ta,pa) in pbar: + score_list_acc.append(score_it(items2list(ta),items2list(pa),set_membership_acc)) + + #final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list)) + final_score_acc=float(sum(score_list_acc))/float(len(score_list_acc)) + final_score_acc *= 100.0 + + pbar = tqdm(zip(input_gt,input_pred)) + pbar.set_description('Computing Wups_0.0') + for (ta,pa) in 
pbar: + score_list_wups_0.append(score_it(items2list(ta),items2list(pa),set_membership_wups_0)) + #final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list)) + final_score_wups_0=float(sum(score_list_wups_0))/float(len(score_list_wups_0)) + final_score_wups_0 *= 100.0 + + pbar = tqdm(zip(input_gt,input_pred)) + pbar.set_description('Computing Wups_0.9') + for (ta,pa) in pbar: + score_list_wups_1.append(score_it(items2list(ta),items2list(pa),set_membership_wups_1)) + #final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list)) + final_score_wups_1=float(sum(score_list_wups_1))/float(len(score_list_wups_1)) + final_score_wups_1 *= 100.0 + + # filtering to obtain the results + #print 'full score:', score_list + # print('accuracy = {0:.2f} | WUPS@{1} = {2:.2f} | WUPS@{3} = {4:.2f}'.format( + # final_score_acc, threshold_0, final_score_wups_0, threshold_1, final_score_wups_1)) + return final_score_acc, final_score_wups_0, final_score_wups_1 + +def get_acc(gts, preds): + sum_correct = 0 + assert len(gts) == len(preds) + for gt, pred in zip(gts, preds): + if gt == pred: + sum_correct += 1 + acc = 100.0 * float(sum_correct/ len(gts)) + return acc diff --git a/core/model/.gitkeep b/core/model/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/core/model/C3D.py b/core/model/C3D.py new file mode 100644 index 0000000..198b002 --- /dev/null +++ b/core/model/C3D.py @@ -0,0 +1,80 @@ +""" +from https://github.com/DavideA/c3d-pytorch/blob/master/C3D_model.py +""" + + +import torch.nn as nn + + +class C3D(nn.Module): + """ + The C3D network as described in [1]. + """ + + def __init__(self): + super(C3D, self).__init__() + + self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2)) + + self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)) + + self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1)) + self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)) + + self.fc6 = nn.Linear(8192, 4096) + self.fc7 = nn.Linear(4096, 4096) + self.fc8 = nn.Linear(4096, 487) + + self.dropout = nn.Dropout(p=0.5) + + self.relu = nn.ReLU() + self.softmax = nn.Softmax() + + def forward(self, x): + + h = self.relu(self.conv1(x)) + h = self.pool1(h) + + h = self.relu(self.conv2(h)) + h = self.pool2(h) + + h = self.relu(self.conv3a(h)) + h = self.relu(self.conv3b(h)) + h = self.pool3(h) + + h = self.relu(self.conv4a(h)) + h = self.relu(self.conv4b(h)) + h = self.pool4(h) + + h = self.relu(self.conv5a(h)) + h = self.relu(self.conv5b(h)) + h = self.pool5(h) + + h = h.view(-1, 8192) + h = self.relu(self.fc6(h)) + h = self.dropout(h) + h = self.relu(self.fc7(h)) + # h = self.dropout(h) + + # logits = self.fc8(h) + # probs = self.softmax(logits) + + return h + +""" +References +---------- +[1] Tran, Du, et al. 
"Learning spatiotemporal features with 3d convolutional networks." +Proceedings of the IEEE international conference on computer vision. 2015. +""" diff --git a/core/model/dnc.py b/core/model/dnc.py new file mode 100644 index 0000000..9fe8fd0 --- /dev/null +++ b/core/model/dnc.py @@ -0,0 +1,323 @@ +""" +PyTorch DNC implementation from +--> +https://github.com/ixaxaar/pytorch-dnc +<-- +""" +# -*- coding: utf-8 -*- + + +import torch.nn as nn +import torch as T +from torch.autograd import Variable as var +import numpy as np + +from torch.nn.utils.rnn import pad_packed_sequence as pad +from torch.nn.utils.rnn import pack_padded_sequence as pack +from torch.nn.utils.rnn import PackedSequence + +from .util import * +from .memory import * + +from torch.nn.init import orthogonal_, xavier_uniform_ + + +class DNC(nn.Module): + + def __init__( + self, + input_size, + hidden_size, + rnn_type='lstm', + num_layers=1, + num_hidden_layers=2, + bias=True, + batch_first=True, + dropout=0, + bidirectional=False, + nr_cells=5, + read_heads=2, + cell_size=10, + nonlinearity='tanh', + gpu_id=-1, + independent_linears=False, + share_memory=True, + debug=False, + clip=20 + ): + super(DNC, self).__init__() + # todo: separate weights and RNNs for the interface and output vectors + + self.input_size = input_size + self.hidden_size = hidden_size + self.rnn_type = rnn_type + self.num_layers = num_layers + self.num_hidden_layers = num_hidden_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = dropout + self.bidirectional = bidirectional + self.nr_cells = nr_cells + self.read_heads = read_heads + self.cell_size = cell_size + self.nonlinearity = nonlinearity + self.gpu_id = gpu_id + self.independent_linears = independent_linears + self.share_memory = share_memory + self.debug = debug + self.clip = clip + + self.w = self.cell_size + self.r = self.read_heads + + self.read_vectors_size = self.r * self.w + self.output_size = self.hidden_size + + self.nn_input_size = self.input_size + self.read_vectors_size + self.nn_output_size = self.output_size + self.read_vectors_size + + self.rnns = [] + self.memories = [] + + for layer in range(self.num_layers): + if self.rnn_type.lower() == 'rnn': + self.rnns.append(nn.RNN((self.nn_input_size if layer == 0 else self.nn_output_size), self.output_size, + bias=self.bias, nonlinearity=self.nonlinearity, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers)) + elif self.rnn_type.lower() == 'gru': + self.rnns.append(nn.GRU((self.nn_input_size if layer == 0 else self.nn_output_size), + self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers)) + if self.rnn_type.lower() == 'lstm': + self.rnns.append(nn.LSTM((self.nn_input_size if layer == 0 else self.nn_output_size), + self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers)) + setattr(self, self.rnn_type.lower() + '_layer_' + str(layer), self.rnns[layer]) + + # memories for each layer + if not self.share_memory: + self.memories.append( + Memory( + input_size=self.output_size, + mem_size=self.nr_cells, + cell_size=self.w, + read_heads=self.r, + gpu_id=self.gpu_id, + independent_linears=self.independent_linears + ) + ) + setattr(self, 'rnn_layer_memory_' + str(layer), self.memories[layer]) + + # only one memory shared by all layers + if self.share_memory: + self.memories.append( + Memory( + input_size=self.output_size, + mem_size=self.nr_cells, + cell_size=self.w, + read_heads=self.r, + 
gpu_id=self.gpu_id, + independent_linears=self.independent_linears + ) + ) + setattr(self, 'rnn_layer_memory_shared', self.memories[0]) + + # final output layer + self.output = nn.Linear(self.nn_output_size, self.output_size) + orthogonal_(self.output.weight) + + if self.gpu_id != -1: + [x.cuda(self.gpu_id) for x in self.rnns] + [x.cuda(self.gpu_id) for x in self.memories] + self.output.cuda() + + def _init_hidden(self, hx, batch_size, reset_experience): + # create empty hidden states if not provided + if hx is None: + hx = (None, None, None) + (chx, mhx, last_read) = hx + + # initialize hidden state of the controller RNN + if chx is None: + h = cuda(T.zeros(self.num_hidden_layers, batch_size, self.output_size), gpu_id=self.gpu_id) + xavier_uniform_(h) + + chx = [ (h, h) if self.rnn_type.lower() == 'lstm' else h for x in range(self.num_layers)] + + # Last read vectors + if last_read is None: + last_read = cuda(T.zeros(batch_size, self.w * self.r), gpu_id=self.gpu_id) + + # memory states + if mhx is None: + if self.share_memory: + mhx = self.memories[0].reset(batch_size, erase=reset_experience) + else: + mhx = [m.reset(batch_size, erase=reset_experience) for m in self.memories] + else: + if self.share_memory: + mhx = self.memories[0].reset(batch_size, mhx, erase=reset_experience) + else: + mhx = [m.reset(batch_size, h, erase=reset_experience) for m, h in zip(self.memories, mhx)] + + return chx, mhx, last_read + + def _debug(self, mhx, debug_obj): + if not debug_obj: + debug_obj = { + 'memory': [], + 'link_matrix': [], + 'precedence': [], + 'read_weights': [], + 'write_weights': [], + 'usage_vector': [], + } + + debug_obj['memory'].append(mhx['memory'][0].data.cpu().numpy()) + debug_obj['link_matrix'].append(mhx['link_matrix'][0][0].data.cpu().numpy()) + debug_obj['precedence'].append(mhx['precedence'][0].data.cpu().numpy()) + debug_obj['read_weights'].append(mhx['read_weights'][0].data.cpu().numpy()) + debug_obj['write_weights'].append(mhx['write_weights'][0].data.cpu().numpy()) + debug_obj['usage_vector'].append(mhx['usage_vector'][0].unsqueeze(0).data.cpu().numpy()) + return debug_obj + + def _layer_forward(self, input, layer, hx=(None, None), pass_through_memory=True): + (chx, mhx) = hx + + # pass through the controller layer + input, chx = self.rnns[layer](input.unsqueeze(1), chx) + input = input.squeeze(1) + + # clip the controller output + if self.clip != 0: + output = T.clamp(input, -self.clip, self.clip) + else: + output = input + + # the interface vector + ξ = output + + # pass through memory + if pass_through_memory: + if self.share_memory: + read_vecs, mhx = self.memories[0](ξ, mhx) + else: + read_vecs, mhx = self.memories[layer](ξ, mhx) + # the read vectors + read_vectors = read_vecs.view(-1, self.w * self.r) + else: + read_vectors = None + + return output, (chx, mhx, read_vectors) + + def forward(self, input, hx=(None, None, None), reset_experience=False, pass_through_memory=True): + # handle packed data + is_packed = type(input) is PackedSequence + if is_packed: + input, lengths = pad(input) + max_length = lengths[0] + else: + max_length = input.size(1) if self.batch_first else input.size(0) + lengths = [input.size(1)] * max_length if self.batch_first else [input.size(0)] * max_length + + batch_size = input.size(0) if self.batch_first else input.size(1) + + if not self.batch_first: + input = input.transpose(0, 1) + # make the data time-first + + controller_hidden, mem_hidden, last_read = self._init_hidden(hx, batch_size, reset_experience) + + # concat input with last read (or 
padding) vectors + inputs = [T.cat([input[:, x, :], last_read], 1) for x in range(max_length)] + + # batched forward pass per element / word / etc + if self.debug: + viz = None + + outs = [None] * max_length + read_vectors = None + rv = [None] * max_length + # pass through time + for time in range(max_length): + # pass thorugh layers + for layer in range(self.num_layers): + # this layer's hidden states + chx = controller_hidden[layer] + m = mem_hidden if self.share_memory else mem_hidden[layer] + # pass through controller + outs[time], (chx, m, read_vectors) = \ + self._layer_forward(inputs[time], layer, (chx, m), pass_through_memory) + + # debug memory + if self.debug: + viz = self._debug(m, viz) + + # store the memory back (per layer or shared) + if self.share_memory: + mem_hidden = m + else: + mem_hidden[layer] = m + controller_hidden[layer] = chx + + if read_vectors is not None: + # the controller output + read vectors go into next layer + outs[time] = T.cat([outs[time], read_vectors], 1) + if layer == self.num_layers - 1: + rv[time] = read_vectors.reshape(batch_size, self.r, self.w) + else: + outs[time] = T.cat([outs[time], last_read], 1) + inputs[time] = outs[time] + + if self.debug: + viz = {k: np.array(v) for k, v in viz.items()} + viz = {k: v.reshape(v.shape[0], v.shape[1] * v.shape[2]) for k, v in viz.items()} + + # pass through final output layer + inputs = [self.output(i) for i in inputs] + outputs = T.stack(inputs, 1 if self.batch_first else 0) + + if is_packed: + outputs = pack(output, lengths) + + if self.debug: + return outputs, (controller_hidden, mem_hidden, read_vectors), rv, viz + else: + return outputs, (controller_hidden, mem_hidden, read_vectors), rv + + def __repr__(self): + s = "\n----------------------------------------\n" + s += '{name}({input_size}, {hidden_size}' + if self.rnn_type != 'lstm': + s += ', rnn_type={rnn_type}' + if self.num_layers != 1: + s += ', num_layers={num_layers}' + if self.num_hidden_layers != 2: + s += ', num_hidden_layers={num_hidden_layers}' + if self.bias != True: + s += ', bias={bias}' + if self.batch_first != True: + s += ', batch_first={batch_first}' + if self.dropout != 0: + s += ', dropout={dropout}' + if self.bidirectional != False: + s += ', bidirectional={bidirectional}' + if self.nr_cells != 5: + s += ', nr_cells={nr_cells}' + if self.read_heads != 2: + s += ', read_heads={read_heads}' + if self.cell_size != 10: + s += ', cell_size={cell_size}' + if self.nonlinearity != 'tanh': + s += ', nonlinearity={nonlinearity}' + if self.gpu_id != -1: + s += ', gpu_id={gpu_id}' + if self.independent_linears != False: + s += ', independent_linears={independent_linears}' + if self.share_memory != True: + s += ', share_memory={share_memory}' + if self.debug != False: + s += ', debug={debug}' + if self.clip != 20: + s += ', clip={clip}' + + s += ")\n" + super(DNC, self).__repr__() + \ + "\n----------------------------------------\n" + return s.format(name=self.__class__.__name__, **self.__dict__) diff --git a/core/model/mca.py b/core/model/mca.py new file mode 100644 index 0000000..b373287 --- /dev/null +++ b/core/model/mca.py @@ -0,0 +1,208 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from core.model.net_utils import FC, MLP, LayerNorm +from core.model.dnc_improved import DNC, SharedMemDNC +from 
core.model.dnc_improved import FeedforwardController +import torch.nn as nn +import torch.nn.functional as F +import torch, math +import time + + +# ------------------------------ +# ---- Multi-Head Attention ---- +# ------------------------------ + +class MHAtt(nn.Module): + def __init__(self, __C): + super(MHAtt, self).__init__() + self.__C = __C + + self.linear_v = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + self.linear_k = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + self.linear_q = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + self.linear_merge = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE) + + self.dropout = nn.Dropout(__C.DROPOUT_R) + + def forward(self, v, k, q, mask): + n_batches = q.size(0) + + v = self.linear_v(v).view( + n_batches, + -1, + self.__C.MULTI_HEAD, + self.__C.HIDDEN_SIZE_HEAD + ).transpose(1, 2) + + k = self.linear_k(k).view( + n_batches, + -1, + self.__C.MULTI_HEAD, + self.__C.HIDDEN_SIZE_HEAD + ).transpose(1, 2) + + q = self.linear_q(q).view( + n_batches, + -1, + self.__C.MULTI_HEAD, + self.__C.HIDDEN_SIZE_HEAD + ).transpose(1, 2) + + atted = self.att(v, k, q, mask) + atted = atted.transpose(1, 2).contiguous().view( + n_batches, + -1, + self.__C.HIDDEN_SIZE + ) + + atted = self.linear_merge(atted) + + return atted + + def att(self, value, key, query, mask): + d_k = query.size(-1) + + scores = torch.matmul( + query, key.transpose(-2, -1) + ) / math.sqrt(d_k) + + if mask is not None: + scores = scores.masked_fill(mask, -1e9) + + att_map = F.softmax(scores, dim=-1) + att_map = self.dropout(att_map) + + return torch.matmul(att_map, value) + + + +# --------------------------- +# ---- Feed Forward Nets ---- +# --------------------------- + +class FFN(nn.Module): + def __init__(self, __C): + super(FFN, self).__init__() + + self.mlp = MLP( + in_size=__C.HIDDEN_SIZE, + mid_size=__C.FF_SIZE, + out_size=__C.HIDDEN_SIZE, + dropout_r=__C.DROPOUT_R, + use_relu=True + ) + + def forward(self, x): + return self.mlp(x) + + +# ------------------------ +# ---- Self Attention ---- +# ------------------------ + +class SA(nn.Module): + def __init__(self, __C): + super(SA, self).__init__() + self.mhatt = MHAtt(__C) + self.ffn = FFN(__C) + + self.dropout1 = nn.Dropout(__C.DROPOUT_R) + self.norm1 = LayerNorm(__C.HIDDEN_SIZE) + + self.dropout2 = nn.Dropout(__C.DROPOUT_R) + self.norm2 = LayerNorm(__C.HIDDEN_SIZE) + + def forward(self, x, x_mask): + x = self.norm1(x + self.dropout1( + self.mhatt(x, x, x, x_mask) + )) + + x = self.norm2(x + self.dropout2( + self.ffn(x) + )) + + return x + +# ------------------------------- +# ---- Self Guided Attention ---- +# ------------------------------- + +class SGA(nn.Module): + def __init__(self, __C): + super(SGA, self).__init__() + + self.mhatt1 = MHAtt(__C) + self.mhatt2 = MHAtt(__C) + self.ffn = FFN(__C) + + self.dropout1 = nn.Dropout(__C.DROPOUT_R) + self.norm1 = LayerNorm(__C.HIDDEN_SIZE) + + self.dropout2 = nn.Dropout(__C.DROPOUT_R) + self.norm2 = LayerNorm(__C.HIDDEN_SIZE) + + self.dropout3 = nn.Dropout(__C.DROPOUT_R) + self.norm3 = LayerNorm(__C.HIDDEN_SIZE) + + def forward(self, x, y, x_mask, y_mask): + x = self.norm1(x + self.dropout1( + self.mhatt1(x, x, x, x_mask) + )) + + x = self.norm2(x + self.dropout2( + self.mhatt2(y, y, x, y_mask) + )) + + x = self.norm3(x + self.dropout3( + self.ffn(x) + )) + + return x + + +# ------------------------------------------------ +# ---- MAC Layers Cascaded by Encoder-Decoder ---- +# ------------------------------------------------ + +class MCA_ED(nn.Module): + def __init__(self, __C): + super(MCA_ED, 
self).__init__() + + self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)]) + self.dec_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)]) + + def forward(self, x, y, x_mask, y_mask): + # Get hidden vector + for enc in self.enc_list: + x = enc(x, x_mask) + + for dec in self.dec_list: + y = dec(y, x, y_mask, x_mask) + return x, y + +class VLC(nn.Module): + def __init__(self, __C): + super(VLC, self).__init__() + + self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)]) + self.dec_lang_frames_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)]) + self.dec_lang_clips_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)]) + + + def forward(self, x, y, z, x_mask, y_mask, z_mask): + # Get hidden vector + for enc in self.enc_list: + x = enc(x, x_mask) + + for dec in self.dec_lang_frames_list: + y = dec(y, x, y_mask, x_mask) + + for dec in self.dec_lang_clips_list: + z = dec(z, x, z_mask, x_mask) + return x, y, z + diff --git a/core/model/memory.py b/core/model/memory.py new file mode 100644 index 0000000..97f1f11 --- /dev/null +++ b/core/model/memory.py @@ -0,0 +1,314 @@ +""" +PyTorch DNC implementation from +--> +https://github.com/ixaxaar/pytorch-dnc +<-- +""" +# -*- coding: utf-8 -*- + +import torch.nn as nn +import torch as T +from torch.autograd import Variable as var +import torch.nn.functional as F +import numpy as np + +from core.model.util import * + + +class Memory(nn.Module): + + def __init__(self, input_size, mem_size=512, cell_size=32, read_heads=4, gpu_id=-1, independent_linears=True): + super(Memory, self).__init__() + + self.input_size = input_size + self.mem_size = mem_size + self.cell_size = cell_size + self.read_heads = read_heads + self.gpu_id = gpu_id + self.independent_linears = independent_linears + + m = self.mem_size + w = self.cell_size + r = self.read_heads + + if self.independent_linears: + self.read_keys_transform = nn.Linear(self.input_size, w * r) + self.read_strengths_transform = nn.Linear(self.input_size, r) + self.write_key_transform = nn.Linear(self.input_size, w) + self.write_strength_transform = nn.Linear(self.input_size, 1) + self.erase_vector_transform = nn.Linear(self.input_size, w) + self.write_vector_transform = nn.Linear(self.input_size, w) + self.free_gates_transform = nn.Linear(self.input_size, r) + self.allocation_gate_transform = nn.Linear(self.input_size, 1) + self.write_gate_transform = nn.Linear(self.input_size, 1) + self.read_modes_transform = nn.Linear(self.input_size, 3 * r) + else: + self.interface_size = (w * r) + (3 * w) + (5 * r) + 3 + self.interface_weights = nn.Linear( + self.input_size, self.interface_size) + + self.I = cuda(1 - T.eye(m).unsqueeze(0), + gpu_id=self.gpu_id) # (1 * n * n) + + def reset(self, batch_size=1, hidden=None, erase=True): + m = self.mem_size + w = self.cell_size + r = self.read_heads + b = batch_size + + if hidden is None: + return { + 'memory': cuda(T.zeros(b, m, w).fill_(0), gpu_id=self.gpu_id), + 'link_matrix': cuda(T.zeros(b, 1, m, m), gpu_id=self.gpu_id), + 'precedence': cuda(T.zeros(b, 1, m), gpu_id=self.gpu_id), + 'read_weights': cuda(T.zeros(b, r, m).fill_(0), gpu_id=self.gpu_id), + 'write_weights': cuda(T.zeros(b, 1, m).fill_(0), gpu_id=self.gpu_id), + 'usage_vector': cuda(T.zeros(b, m), gpu_id=self.gpu_id), + # 'free_gates': cuda(T.zeros(b, r), gpu_id=self.gpu_id), + # 'alloc_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id), + # 'write_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id), + # 'read_modes': cuda(T.zeros(b, r, 3), gpu_id=self.gpu_id) + } + else: + 
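+            # Reuse the provided hidden state, cloning each tensor so the caller's
+            # copy is not modified in place by the optional erase below.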
hidden['memory'] = hidden['memory'].clone() + hidden['link_matrix'] = hidden['link_matrix'].clone() + hidden['precedence'] = hidden['precedence'].clone() + hidden['read_weights'] = hidden['read_weights'].clone() + hidden['write_weights'] = hidden['write_weights'].clone() + hidden['usage_vector'] = hidden['usage_vector'].clone() + # hidden['free_gates'] = hidden['free_gates'].clone() + # hidden['alloc_gates'] = hidden['alloc_gates'].clone() + # hidden['write_gates'] = hidden['write_gates'].clone() + # hidden['read_modes'] = hidden['read_modes'].clone() + + if erase: + hidden['memory'].data.fill_(0) + hidden['link_matrix'].data.zero_() + hidden['precedence'].data.zero_() + hidden['read_weights'].data.fill_(0) + hidden['write_weights'].data.fill_(0) + hidden['usage_vector'].data.zero_() + # hidden['free_gates'].data.fill_() + # hidden['alloc_gates'].data.fill_() + # hidden['write_gates'].data.fill_() + # hidden['read_modes'].data.fill_() + + return hidden + + def get_usage_vector(self, usage, free_gates, read_weights, write_weights): + # write_weights = write_weights.detach() # detach from the computation graph + # if read_weights.size(0) > free_gates.size(0): + # read_weights = read_weights[:free_gates.size(0), :, :] + # if usage.size(0) > free_gates.size(0): + # usage = usage[:free_gates.size(0), :] + # if write_weights.size(0) > free_gates.size(0): + # write_weights = write_weights[:free_gates.size(0), :, :] + usage = usage + (1 - usage) * (1 - T.prod(1 - write_weights, 1)) + ψ = T.prod(1 - free_gates.unsqueeze(2) * read_weights, 1) + return usage * ψ + + def allocate(self, usage, write_gate): + # ensure values are not too small prior to cumprod. + usage = δ + (1 - δ) * usage + batch_size = usage.size(0) + # free list + sorted_usage, φ = T.topk(usage, self.mem_size, dim=1, largest=False) + + # cumprod with exclusive=True + # https://discuss.pytorch.org/t/cumprod-exclusive-true-equivalences/2614/8 + v = var(sorted_usage.data.new(batch_size, 1).fill_(1)) + cat_sorted_usage = T.cat((v, sorted_usage), 1) + prod_sorted_usage = T.cumprod(cat_sorted_usage, 1)[:, :-1] + + sorted_allocation_weights = (1 - sorted_usage) * prod_sorted_usage.squeeze() + + # construct the reverse sorting index https://stackoverflow.com/questions/2483696/undo-or-reverse-argsort-python + _, φ_rev = T.topk(φ, k=self.mem_size, dim=1, largest=False) + allocation_weights = sorted_allocation_weights.gather(1, φ_rev.long()) + + return allocation_weights.unsqueeze(1), usage + + def write_weighting(self, memory, write_content_weights, allocation_weights, write_gate, allocation_gate): + ag = allocation_gate.unsqueeze(-1) + wg = write_gate.unsqueeze(-1) + + return wg * (ag * allocation_weights + (1 - ag) * write_content_weights) + + def get_link_matrix(self, link_matrix, write_weights, precedence): + precedence = precedence.unsqueeze(2) + write_weights_i = write_weights.unsqueeze(3) + write_weights_j = write_weights.unsqueeze(2) + + prev_scale = 1 - write_weights_i - write_weights_j + new_link_matrix = write_weights_i * precedence + + link_matrix = prev_scale * link_matrix + new_link_matrix + # trick to delete diag elems + return self.I.expand_as(link_matrix) * link_matrix + + def update_precedence(self, precedence, write_weights): + return (1 - T.sum(write_weights, 2, keepdim=True)) * precedence + write_weights + + def write(self, write_key, write_vector, erase_vector, free_gates, read_strengths, write_strength, write_gate, allocation_gate, hidden): + # get current usage + hidden['usage_vector'] = self.get_usage_vector( + 
hidden['usage_vector'], + free_gates, + hidden['read_weights'], + hidden['write_weights'] + ) + + # lookup memory with write_key and write_strength + write_content_weights = self.content_weightings( + hidden['memory'], write_key, write_strength) + + # get memory allocation + alloc, _ = self.allocate( + hidden['usage_vector'], + allocation_gate * write_gate + ) + + # get write weightings + hidden['write_weights'] = self.write_weighting( + hidden['memory'], + write_content_weights, + alloc, + write_gate, + allocation_gate + ) + + weighted_resets = hidden['write_weights'].unsqueeze( + 3) * erase_vector.unsqueeze(2) + reset_gate = T.prod(1 - weighted_resets, 1) + # Update memory + hidden['memory'] = hidden['memory'] * reset_gate + + hidden['memory'] = hidden['memory'] + \ + T.bmm(hidden['write_weights'].transpose(1, 2), write_vector) + + # update link_matrix + hidden['link_matrix'] = self.get_link_matrix( + hidden['link_matrix'], + hidden['write_weights'], + hidden['precedence'] + ) + hidden['precedence'] = self.update_precedence( + hidden['precedence'], hidden['write_weights']) + + return hidden + + def content_weightings(self, memory, keys, strengths): + # if memory.size(0) > keys.size(0): + # memory = memory[:keys.size(0), :, :] + d = θ(memory, keys) + return σ(d * strengths.unsqueeze(2), 2) + + def directional_weightings(self, link_matrix, read_weights): + rw = read_weights.unsqueeze(1) + + f = T.matmul(link_matrix, rw.transpose(2, 3)).transpose(2, 3) + b = T.matmul(rw, link_matrix) + return f.transpose(1, 2), b.transpose(1, 2) + + def read_weightings(self, memory, content_weights, link_matrix, read_modes, read_weights): + forward_weight, backward_weight = self.directional_weightings( + link_matrix, read_weights) + + content_mode = read_modes[:, :, 2].contiguous( + ).unsqueeze(2) * content_weights + backward_mode = T.sum( + read_modes[:, :, 0:1].contiguous().unsqueeze(3) * backward_weight, 2) + forward_mode = T.sum( + read_modes[:, :, 1:2].contiguous().unsqueeze(3) * forward_weight, 2) + + return backward_mode + content_mode + forward_mode + + def read_vectors(self, memory, read_weights): + return T.bmm(read_weights, memory) + + def read(self, read_keys, read_strengths, read_modes, hidden): + content_weights = self.content_weightings( + hidden['memory'], read_keys, read_strengths) + + hidden['read_weights'] = self.read_weightings( + hidden['memory'], + content_weights, + hidden['link_matrix'], + read_modes, + hidden['read_weights'] + ) + read_vectors = self.read_vectors( + hidden['memory'], hidden['read_weights']) + return read_vectors, hidden + + def forward(self, ξ, hidden): + + # ξ = ξ.detach() + m = self.mem_size + w = self.cell_size + r = self.read_heads + b = ξ.size()[0] + + if self.independent_linears: + # r read keys (b * r * w) + read_keys = self.read_keys_transform(ξ).view(b, r, w) + # r read strengths (b * r) + read_strengths = F.softplus( + self.read_strengths_transform(ξ).view(b, r)) + # write key (b * 1 * w) + write_key = self.write_key_transform(ξ).view(b, 1, w) + # write strength (b * 1) + write_strength = F.softplus( + self.write_strength_transform(ξ).view(b, 1)) + # erase vector (b * 1 * w) + erase_vector = T.sigmoid( + self.erase_vector_transform(ξ).view(b, 1, w)) + # write vector (b * 1 * w) + write_vector = self.write_vector_transform(ξ).view(b, 1, w) + # r free gates (b * r) + free_gates = T.sigmoid(self.free_gates_transform(ξ).view(b, r)) + # allocation gate (b * 1) + allocation_gate = T.sigmoid( + self.allocation_gate_transform(ξ).view(b, 1)) + # write gate (b * 1) 
+ write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1)) + # read modes (b * r * 3) + read_modes = σ(self.read_modes_transform(ξ).view(b, r, 3), -1) + else: + ξ = self.interface_weights(ξ) + # r read keys (b * w * r) + read_keys = ξ[:, :r * w].contiguous().view(b, r, w) + # r read strengths (b * r) + read_strengths = F.softplus( + ξ[:, r * w:r * w + r].contiguous().view(b, r)) + # write key (b * w * 1) + write_key = ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w) + # write strength (b * 1) + write_strength = F.softplus( + ξ[:, r * w + r + w].contiguous().view(b, 1)) + # erase vector (b * w) + erase_vector = T.sigmoid( + ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w)) + # write vector (b * w) + write_vector = ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w) + # r free gates (b * r) + free_gates = T.sigmoid( + ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r)) + # allocation gate (b * 1) + allocation_gate = T.sigmoid( + ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1)) + # write gate (b * 1) + write_gate = T.sigmoid( + ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1) + # read modes (b * 3*r) + read_modes = σ(ξ[:, r * w + 2 * r + 3 * w + 3: r * + w + 5 * r + 3 * w + 3].contiguous().view(b, r, 3), -1) + + hidden = self.write(write_key, write_vector, erase_vector, free_gates, + read_strengths, write_strength, write_gate, allocation_gate, hidden) + hidden["free_gates"] = free_gates.clone().detach() + hidden["allocation_gate"] = allocation_gate.clone().detach() + hidden["write_gate"] = write_gate.clone().detach() + hidden["read_modes"] = read_modes.clone().detach() + + return self.read(read_keys, read_strengths, read_modes, hidden) diff --git a/core/model/net.py b/core/model/net.py new file mode 100644 index 0000000..d8475ee --- /dev/null +++ b/core/model/net.py @@ -0,0 +1,501 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from core.model.net_utils import FC, MLP, LayerNorm +from core.model.mca import SA, MCA_ED, VLC +from core.model.dnc import DNC + +import torch.nn as nn +import torch.nn.functional as F +import torch + +# ------------------------------ +# ---- Flatten the sequence ---- +# ------------------------------ + +class AttFlat(nn.Module): + def __init__(self, __C): + super(AttFlat, self).__init__() + self.__C = __C + + self.mlp = MLP( + in_size=__C.HIDDEN_SIZE, + mid_size=__C.FLAT_MLP_SIZE, + out_size=__C.FLAT_GLIMPSES, + dropout_r=__C.DROPOUT_R, + use_relu=True + ) + + self.linear_merge = nn.Linear( + __C.HIDDEN_SIZE * __C.FLAT_GLIMPSES, + __C.FLAT_OUT_SIZE + ) + + def forward(self, x, x_mask): + att = self.mlp(x) + att = att.masked_fill( + x_mask.squeeze(1).squeeze(1).unsqueeze(2), + -1e9 + ) + att = F.softmax(att, dim=1) + + att_list = [] + for i in range(self.__C.FLAT_GLIMPSES): + att_list.append( + torch.sum(att[:, :, i: i + 1] * x, dim=1) + ) + + x_atted = torch.cat(att_list, dim=1) + x_atted = self.linear_merge(x_atted) + + return x_atted + +class AttFlatMem(AttFlat): + def __init__(self, __C): + super(AttFlatMem, self).__init__(__C) + self.__C = __C + + def forward(self, x_mem, x, x_mask): + att = self.mlp(x_mem) + att = att.masked_fill( + x_mask.squeeze(1).squeeze(1).unsqueeze(2), + float('-inf') 
+ ) + att = F.softmax(att, dim=1) + att_list = [] + for i in range(self.__C.FLAT_GLIMPSES): + att_list.append( + torch.sum(att[:, :, i: i + 1] * x, dim=1) + ) + x_atted = torch.cat(att_list, dim=1) + x_atted = self.linear_merge(x_atted) + + return x_atted +# ------------------------- +# ---- Main MCAN Model ---- +# ------------------------- + +class Net1(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net1, self).__init__() + print('Training with Network type 1: VLCN') + self.pretrained_path = __C.PRETRAINED_PATH + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = VLC(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_frame = AttFlat(__C) + self.attflat_clip = AttFlat(__C) + + self.dnc = DNC( + __C.FLAT_OUT_SIZE, + __C.FLAT_OUT_SIZE, + rnn_type='lstm', + num_layers=2, + num_hidden_layers=2, + bias=True, + batch_first=True, + dropout=0, + bidirectional=True, + nr_cells=__C.CELL_COUNT_DNC, + read_heads=__C.N_READ_HEADS_DNC, + cell_size=__C.WORD_LENGTH_DNC, + nonlinearity='tanh', + gpu_id=0, + independent_linears=False, + share_memory=False, + debug=False, + clip=20, + ) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + + self.proj_norm_dnc = LayerNorm(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC) + self.linear_dnc = FC(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC, __C.FLAT_OUT_SIZE, dropout_r=0.2) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # Backbone Framework + lang_feat, frame_feat, clip_feat = self.backbone( + lang_feat, + frame_feat, + clip_feat, + lang_feat_mask, + frame_feat_mask, + clip_feat_mask + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + frame_feat = self.attflat_frame( + frame_feat, + frame_feat_mask + ) + + clip_feat = self.attflat_clip( + clip_feat, + clip_feat_mask + ) + proj_feat_0 = lang_feat + frame_feat + clip_feat + proj_feat_0 = self.proj_norm(proj_feat_0) + + proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1) + proj_feat_1, (_, _, rv), _ = self.dnc(proj_feat_1, (None, None, None), reset_experience=True, pass_through_memory=True) + proj_feat_1 = proj_feat_1.sum(1) + proj_feat_1 = torch.cat([proj_feat_1, rv], dim=-1) + proj_feat_1 = self.proj_norm_dnc(proj_feat_1) + proj_feat_1 = self.linear_dnc(proj_feat_1) + # proj_feat_1 = self.proj_norm(proj_feat_1) + + proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1)) + + return proj_feat + + def load_pretrained_weights(self): + pretrained_msvd = torch.load(self.pretrained_path)['state_dict'] + for n_pretrained, p_pretrained in 
pretrained_msvd.items(): + if 'dnc' in n_pretrained: + self.state_dict()[n_pretrained].copy_(p_pretrained) + print('Pre-trained dnc-weights successfully loaded!') + + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + +class Net2(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net2, self).__init__() + print('Training with Network type 2: VLCN-FLF') + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = VLC(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_frame = AttFlat(__C) + self.attflat_clip = AttFlat(__C) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # Backbone Framework + lang_feat, frame_feat, clip_feat = self.backbone( + lang_feat, + frame_feat, + clip_feat, + lang_feat_mask, + frame_feat_mask, + clip_feat_mask + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + frame_feat = self.attflat_frame( + frame_feat, + frame_feat_mask + ) + + clip_feat = self.attflat_clip( + clip_feat, + clip_feat_mask + ) + proj_feat = lang_feat + frame_feat + clip_feat + proj_feat = self.proj_norm(proj_feat) + proj_feat = torch.sigmoid(self.proj(proj_feat)) + + return proj_feat + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + +class Net3(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net3, self).__init__() + print('Training with Network type 3: VLCN+LSTM') + + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = VLC(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_frame = AttFlat(__C) + self.attflat_clip = AttFlat(__C) + + self.lstm_fusion = nn.LSTM( + input_size=__C.FLAT_OUT_SIZE, + hidden_size=__C.FLAT_OUT_SIZE, + num_layers=2, + batch_first=True, + bidirectional=True + ) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj_feat_1 = nn.Linear(__C.FLAT_OUT_SIZE * 2, __C.FLAT_OUT_SIZE) + + self.proj_norm_lstm = 
LayerNorm(__C.FLAT_OUT_SIZE) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # Backbone Framework + lang_feat, frame_feat, clip_feat = self.backbone( + lang_feat, + frame_feat, + clip_feat, + lang_feat_mask, + frame_feat_mask, + clip_feat_mask + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + frame_feat = self.attflat_frame( + frame_feat, + frame_feat_mask + ) + + clip_feat = self.attflat_clip( + clip_feat, + clip_feat_mask + ) + proj_feat_0 = lang_feat + frame_feat + clip_feat + proj_feat_0 = self.proj_norm(proj_feat_0) + + proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1) + proj_feat_1, _ = self.lstm_fusion(proj_feat_1) + proj_feat_1 = proj_feat_1.sum(1) + proj_feat_1 = self.proj_feat_1(proj_feat_1) + proj_feat_1 = self.proj_norm_lstm(proj_feat_1) + + proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1)) + + return proj_feat + + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 0).unsqueeze(1).unsqueeze(2) + +class Net4(nn.Module): + def __init__(self, __C, pretrained_emb, token_size, answer_size): + super(Net4, self).__init__() + print('Training with Network type 4: MCAN') + self.embedding = nn.Embedding( + num_embeddings=token_size, + embedding_dim=__C.WORD_EMBED_SIZE + ) + + # Loading the GloVe embedding weights + if __C.USE_GLOVE: + self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb)) + + self.lstm = nn.LSTM( + input_size=__C.WORD_EMBED_SIZE, + hidden_size=__C.HIDDEN_SIZE, + num_layers=1, + batch_first=True + ) + + self.frame_feat_linear = nn.Linear( + __C.FRAME_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + + self.clip_feat_linear = nn.Linear( + __C.CLIP_FEAT_SIZE, + __C.HIDDEN_SIZE + ) + self.backbone = MCA_ED(__C) + + self.attflat_lang = AttFlat(__C) + self.attflat_vid = AttFlat(__C) + + self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE) + self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size) + + + def forward(self, frame_feat, clip_feat, ques_ix): + + # Make mask + lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2)) + frame_feat_mask = self.make_mask(frame_feat) + clip_feat_mask = self.make_mask(clip_feat) + + # Pre-process Language Feature + lang_feat = self.embedding(ques_ix) + lang_feat, _ = self.lstm(lang_feat) + + + # Pre-process Video Feature + frame_feat = self.frame_feat_linear(frame_feat) + clip_feat = self.clip_feat_linear(clip_feat) + + # concat frame and clip features + vid_feat = torch.cat([frame_feat, clip_feat], dim=1) + vid_feat_mask = torch.cat([frame_feat_mask, clip_feat_mask], dim=-1) + # Backbone Framework + lang_feat, vid_feat = self.backbone( + lang_feat, + vid_feat, + lang_feat_mask, + vid_feat_mask, + ) + + lang_feat = self.attflat_lang( + lang_feat, + lang_feat_mask + ) + + vid_feat = self.attflat_vid( + vid_feat, + vid_feat_mask + ) + + proj_feat = lang_feat + vid_feat + proj_feat = self.proj_norm(proj_feat) + proj_feat = torch.sigmoid(self.proj(proj_feat)) + + return proj_feat + + # Masking + def make_mask(self, feature): + return (torch.sum( + torch.abs(feature), + dim=-1 + ) == 
0).unsqueeze(1).unsqueeze(2) + + diff --git a/core/model/net_utils.py b/core/model/net_utils.py new file mode 100644 index 0000000..822edd0 --- /dev/null +++ b/core/model/net_utils.py @@ -0,0 +1,62 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +import torch.nn as nn +import os +import torch + + +class FC(nn.Module): + def __init__(self, in_size, out_size, dropout_r=0., use_relu=True): + super(FC, self).__init__() + self.dropout_r = dropout_r + self.use_relu = use_relu + + self.linear = nn.Linear(in_size, out_size) + + if use_relu: + self.relu = nn.ReLU(inplace=True) + + if dropout_r > 0: + self.dropout = nn.Dropout(dropout_r) + + def forward(self, x): + x = self.linear(x) + + if self.use_relu: + x = self.relu(x) + + if self.dropout_r > 0: + x = self.dropout(x) + + return x + + +class MLP(nn.Module): + def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True): + super(MLP, self).__init__() + + self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu) + self.linear = nn.Linear(mid_size, out_size) + + def forward(self, x): + return self.linear(self.fc(x)) + + +class LayerNorm(nn.Module): + def __init__(self, size, eps=1e-6): + super(LayerNorm, self).__init__() + self.eps = eps + + self.a_2 = nn.Parameter(torch.ones(size)) + self.b_2 = nn.Parameter(torch.zeros(size)) + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + diff --git a/core/model/optim.py b/core/model/optim.py new file mode 100644 index 0000000..d01712c --- /dev/null +++ b/core/model/optim.py @@ -0,0 +1,98 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +import torch +import torch.optim as Optim + + +class WarmupOptimizer(object): + def __init__(self, lr_base, optimizer, data_size, batch_size): + self.optimizer = optimizer + self._step = 0 + self.lr_base = lr_base + self._rate = 0 + self.data_size = data_size + self.batch_size = batch_size + + def step(self): + self._step += 1 + + rate = self.rate() + for p in self.optimizer.param_groups: + p['lr'] = rate + self._rate = rate + + self.optimizer.step() + + + def zero_grad(self): + self.optimizer.zero_grad() + + + def rate(self, step=None): + if step is None: + step = self._step + + if step <= int(self.data_size / self.batch_size * 1): + r = self.lr_base * 1/4. + elif step <= int(self.data_size / self.batch_size * 2): + r = self.lr_base * 2/4. + elif step <= int(self.data_size / self.batch_size * 3): + r = self.lr_base * 3/4. 
+        else:
+            r = self.lr_base
+
+        return r
+
+
+def get_optim(__C, model, data_size, optimizer, lr_base=None):
+    if lr_base is None:
+        lr_base = __C.LR_BASE
+
+    # modules = model._modules
+    # params_list = []
+    # for m in modules:
+    #     if 'dnc' in m:
+    #         params_list.append({
+    #             'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
+    #             'lr': __C.LR_DNC_BASE,
+    #             'flag': True
+    #         })
+    #     else:
+    #         params_list.append({
+    #             'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
+
+    #         })
+    if optimizer == 'adam':
+        optim = Optim.Adam(
+            filter(lambda p: p.requires_grad, model.parameters()),
+            lr=0,
+            betas=__C.OPT_BETAS,
+            eps=__C.OPT_EPS,
+
+        )
+    elif optimizer == 'rmsprop':
+        optim = Optim.RMSprop(
+            filter(lambda p: p.requires_grad, model.parameters()),
+            lr=0,
+            eps=__C.OPT_EPS,
+            weight_decay=__C.OPT_WEIGHT_DECAY
+        )
+    else:
+        raise ValueError('{} optimizer is not supported'.format(optimizer))
+    return WarmupOptimizer(
+        lr_base,
+        optim,
+        data_size,
+        __C.BATCH_SIZE
+    )
+
+
+def adjust_lr(optim, decay_r):
+    optim.lr_base *= decay_r
+
+def adjust_lr_dnc(optim, decay_r):
+    optim.lr_dnc_base *= decay_r
diff --git a/core/model/utils.py b/core/model/utils.py
new file mode 100644
index 0000000..8f57508
--- /dev/null
+++ b/core/model/utils.py
@@ -0,0 +1,163 @@
+"""
+PyTorch DNC implementation from
+-->
+https://github.com/ixaxaar/pytorch-dnc
+<--
+"""
+
+import torch.nn as nn
+import torch as T
+import torch.nn.functional as F
+import numpy as np
+import torch
+from torch.autograd import Variable
+import re
+import string
+
+
+def recursiveTrace(obj):
+    print(type(obj))
+    if hasattr(obj, 'grad_fn'):
+        print(obj.grad_fn)
+        recursiveTrace(obj.grad_fn)
+    elif hasattr(obj, 'saved_variables'):
+        print(obj.requires_grad, len(obj.saved_tensors), len(obj.saved_variables))
+        [print(v) for v in obj.saved_variables]
+        [recursiveTrace(v.grad_fn) for v in obj.saved_variables]
+
+
+def cuda(x, grad=False, gpu_id=-1):
+    x = x.float() if T.is_tensor(x) else x
+    if gpu_id == -1:
+        t = T.FloatTensor(x)
+        t.requires_grad = grad
+        return t
+    else:
+        t = T.FloatTensor(x.pin_memory()).cuda(gpu_id)
+        t.requires_grad = grad
+        return t
+
+
+def cudavec(x, grad=False, gpu_id=-1):
+    if gpu_id == -1:
+        t = T.Tensor(T.from_numpy(x))
+        t.requires_grad = grad
+        return t
+    else:
+        t = T.Tensor(T.from_numpy(x).pin_memory()).cuda(gpu_id)
+        t.requires_grad = grad
+        return t
+
+
+def cudalong(x, grad=False, gpu_id=-1):
+    if gpu_id == -1:
+        t = T.LongTensor(T.from_numpy(x.astype(np.long)))
+        t.requires_grad = grad
+        return t
+    else:
+        t = T.LongTensor(T.from_numpy(x.astype(np.long)).pin_memory()).cuda(gpu_id)
+        t.requires_grad = grad
+        return t
+
+
+def θ(a, b, normBy=2):
+    """Batchwise Cosine similarity
+    Arguments:
+        a {Tensor} -- A 3D Tensor (b * m * w)
+        b {Tensor} -- A 3D Tensor (b * r * w)
+    Returns:
+        Tensor -- Batchwise cosine similarity (b * r * m)
+    """
+    dot = T.bmm(a, b.transpose(1, 2))
+    a_norm = T.norm(a, normBy, dim=2).unsqueeze(2)
+    b_norm = T.norm(b, normBy, dim=2).unsqueeze(1)
+    cos = dot / (a_norm * b_norm + δ)
+    return cos.transpose(1, 2).contiguous()
+
+
+def σ(input, axis=1):
+    """Softmax on an axis
+    Arguments:
+        input {Tensor} -- input Tensor
+    Keyword Arguments:
+        axis {number} -- axis on which to take softmax on (default: {1})
+    Returns:
+        Tensor -- Softmax output Tensor
+    """
+    input_size = input.size()
+
+    trans_input = input.transpose(axis, len(input_size) - 1)
+    trans_size = trans_input.size()
+
+    input_2d = 
trans_input.contiguous().view(-1, trans_size[-1]) + soft_max_2d = F.softmax(input_2d, -1) + soft_max_nd = soft_max_2d.view(*trans_size) + return soft_max_nd.transpose(axis, len(input_size) - 1) + +δ = 1e-6 + + +def register_nan_checks(model): + def check_grad(module, grad_input, grad_output): + # print(module) you can add this to see that the hook is called + # print('hook called for ' + str(type(module))) + if any(np.all(np.isnan(gi.data.cpu().numpy())) for gi in grad_input if gi is not None): + print('NaN gradient in grad_input ' + type(module).__name__) + + model.apply(lambda module: module.register_backward_hook(check_grad)) + + +def apply_dict(dic): + for k, v in dic.items(): + apply_var(v, k) + if isinstance(v, nn.Module): + key_list = [a for a in dir(v) if not a.startswith('__')] + for key in key_list: + apply_var(getattr(v, key), key) + for pk, pv in v._parameters.items(): + apply_var(pv, pk) + + +def apply_var(v, k): + if isinstance(v, Variable) and v.requires_grad: + v.register_hook(check_nan_gradient(k)) + + +def check_nan_gradient(name=''): + def f(tensor): + if np.isnan(T.mean(tensor).data.cpu().numpy()): + print('\nnan gradient of {} :'.format(name)) + # print(tensor) + # assert 0, 'nan gradient' + return tensor + return f + +def ptr(tensor): + if T.is_tensor(tensor): + return tensor.storage().data_ptr() + elif hasattr(tensor, 'data'): + return tensor.clone().data.storage().data_ptr() + else: + return tensor + +# TODO: EWW change this shit +def ensure_gpu(tensor, gpu_id): + if "cuda" in str(type(tensor)) and gpu_id != -1: + return tensor.cuda(gpu_id) + elif "cuda" in str(type(tensor)): + return tensor.cpu() + elif "Tensor" in str(type(tensor)) and gpu_id != -1: + return tensor.cuda(gpu_id) + elif "Tensor" in str(type(tensor)): + return tensor + elif type(tensor) is np.ndarray: + return cudavec(tensor, gpu_id=gpu_id).data + else: + return tensor + + +def print_gradient(x, name): + s = "Gradient of " + name + " ----------------------------------" + x.register_hook(lambda y: print(s, y.squeeze())) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..78cbad9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,48 @@ +absl-py==0.12.0 +blis==0.7.4 +cachetools==4.2.1 +catalogue==1.0.0 +certifi==2020.12.5 +chardet==4.0.0 +click==7.1.2 +cycler==0.10.0 +cymem==2.0.5 +google-auth==1.28.0 +google-auth-oauthlib==0.4.3 +grpcio==1.36.1 +idna==2.10 +importlib-metadata==3.7.3 +joblib==1.0.1 +Markdown==3.3.4 +mkl-fft==1.3.0 +mkl-random==1.1.1 +mkl-service==2.3.0 +murmurhash==1.0.5 +nltk==3.6.2 +oauthlib==3.1.0 +olefile==0.46 +plac==1.1.3 +positional-encodings==3.0.0 +preshed==3.0.5 +protobuf==3.15.6 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +PyYAML==5.4.1 +regex==2021.4.4 +requests==2.25.1 +requests-oauthlib==1.3.0 +rsa==4.7.2 +scikit-video==1.1.11 +scipy==1.5.4 +spacy==2.3.5 +srsly==1.0.5 +tensorboard==2.4.1 +tensorboard-plugin-wit==1.8.0 +tensorboardX==2.1 +thinc==7.4.5 +tqdm==4.59.0 +typing-extensions==3.7.4.3 +urllib3==1.26.4 +wasabi==0.8.2 +Werkzeug==1.0.1 +zipp==3.4.1 diff --git a/run.py b/run.py new file mode 100644 index 0000000..606427f --- /dev/null +++ b/run.py @@ -0,0 +1,198 @@ +# -------------------------------------------------------- +# mcan-vqa (Deep Modular Co-Attention Networks) +# Licensed under The MIT License [see LICENSE for details] +# Written by Yuhao Cui https://github.com/cuiyuhao1996 +# -------------------------------------------------------- + +from cfgs.base_cfgs import Cfgs +from core.exec import Execution +import argparse, yaml, os + +def 
str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+def parse_args():
+    '''
+    Parse input arguments
+    '''
+    parser = argparse.ArgumentParser(description='VLCN Args')
+
+    parser.add_argument('--RUN', dest='RUN_MODE',
+                        default='train',
+                        choices=['train', 'val', 'test'],
+                        help='{train, val, test}',
+                        type=str) # , required=True)
+
+    parser.add_argument('--MODEL', dest='MODEL',
+                        choices=['small', 'large'],
+                        help='{small, large}',
+                        default='small', type=str)
+
+    parser.add_argument('--OPTIM', dest='OPTIM',
+                        choices=['adam', 'rmsprop'],
+                        help='The optimizer',
+                        default='rmsprop', type=str)
+
+    parser.add_argument('--SPLIT', dest='TRAIN_SPLIT',
+                        choices=['train', 'train+val'],
+                        help="set training split, "
+                             "e.g. 'train', 'train+val'; "
+                             "setting 'train' triggers the "
+                             "eval after every epoch",
+                        default='train',
+                        type=str)
+
+    parser.add_argument('--EVAL_EE', dest='EVAL_EVERY_EPOCH',
+                        default=True,
+                        help='set True to evaluate the '
+                             'val split when an epoch finishes '
+                             "(only works when training with "
+                             "the 'train' split)",
+                        type=bool)
+
+    parser.add_argument('--SAVE_PRED', dest='TEST_SAVE_PRED',
+                        help='set True to save the '
+                             'prediction vectors '
+                             '(only works in testing)',
+                        default=False,
+                        type=bool)
+
+    parser.add_argument('--BS', dest='BATCH_SIZE',
+                        help='batch size during training',
+                        default=64,
+                        type=int)
+
+    parser.add_argument('--MAX_EPOCH', dest='MAX_EPOCH',
+                        default=30,
+                        help='max training epoch',
+                        type=int)
+
+    parser.add_argument('--PRELOAD', dest='PRELOAD',
+                        help='pre-load the features into memory '
+                             'to increase the I/O speed',
+                        default=False,
+                        type=bool)
+
+    parser.add_argument('--GPU', dest='GPU',
+                        help="gpu select, e.g. '0, 1, 2'",
+                        default='0',
+                        type=str)
+
+    parser.add_argument('--SEED', dest='SEED',
+                        help='fix random seed',
+                        default=42,
+                        type=int)
+
+    parser.add_argument('--VERSION', dest='VERSION',
+                        help='version control',
+                        default='1.0.0',
+                        type=str)
+
+    parser.add_argument('--RESUME', dest='RESUME',
+                        default=False,
+                        help='resume training',
+                        type=str2bool)
+
+    parser.add_argument('--CKPT_V', dest='CKPT_VERSION',
+                        help='checkpoint version',
+                        type=str)
+
+    parser.add_argument('--CKPT_E', dest='CKPT_EPOCH',
+                        help='checkpoint epoch',
+                        type=int)
+
+    parser.add_argument('--CKPT_PATH', dest='CKPT_PATH',
+                        help='load checkpoint path, we '
+                             'recommend that you use '
+                             'CKPT_VERSION and CKPT_EPOCH '
+                             'instead',
+                        type=str)
+
+    parser.add_argument('--ACCU', dest='GRAD_ACCU_STEPS',
+                        help='reduce gpu memory usage',
+                        type=int)
+
+    parser.add_argument('--NW', dest='NUM_WORKERS',
+                        help='multithreaded loading',
+                        default=0,
+                        type=int)
+
+    parser.add_argument('--PINM', dest='PIN_MEM',
+                        help='use pin memory',
+                        type=bool)
+
+    parser.add_argument('--VERB', dest='VERBOSE',
+                        help='verbose print',
+                        type=bool)
+
+    parser.add_argument('--DATA_PATH', dest='DATASET_PATH',
+                        default='/projects/abdessaied/data/MSRVTT-QA/',
+                        help='Dataset root path',
+                        type=str)
+
+    parser.add_argument('--EXP_NAME', dest='EXP_NAME',
+                        help='The name of the experiment',
+                        default="test",
+                        type=str)
+
+    parser.add_argument('--DEBUG', dest='DEBUG',
+                        help='Triggers debug mode: small fractions of the data are loaded',
+                        default='0',
+                        type=str2bool)
+
+    parser.add_argument('--ENABLE_TIME_MONITORING', dest='ENABLE_TIME_MONITORING',
+                        help='Triggers time monitoring when training',
+                        default='0',
+                        type=str2bool)
+
+    parser.add_argument('--MODEL_TYPE', dest='MODEL_TYPE',
+                        help='The model type to be used\n 1: VLCN \n 2: VLCN-FLF \n 3: VLCN+LSTM \n 4: MCAN',
+                        default=1,
+                        type=int)
+
+    parser.add_argument('--PRETRAINED_PATH', dest='PRETRAINED_PATH',
+                        help='Path to pretrained weights on MSVD',
+                        default='-',
+                        type=str)
+
+    parser.add_argument('--TEST_EPOCH', dest='TEST_EPOCH',
+                        help='epoch of the checkpoint used for testing',
+                        default=7,
+                        type=int)
+
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    os.chdir(os.path.dirname(os.path.abspath(__file__)))
+    __C = Cfgs(args.EXP_NAME, args.DATASET_PATH)
+    args_dict = __C.parse_to_dict(args)
+
+    cfg_file = "cfgs/{}_model.yml".format(args.MODEL)
+    with open(cfg_file, 'r') as f:
+        # explicit Loader avoids the PyYAML 5.x warning on yaml.load
+        yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
+
+    args_dict = {**yaml_dict, **args_dict}
+
+    __C.add_args(args_dict)
+    __C.proc()
+
+    print('Hyper Parameters:')
+    print(__C)
+
+    __C.check_path()
+    os.environ['CUDA_VISIBLE_DEVICES'] = __C.GPU
+
+    execution = Execution(__C)
+    execution.run(__C.RUN_MODE)
+
+    # execution.run('test', epoch=__C.TEST_EPOCH)
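Note: the interface vector ξ produced in `core/model/dnc.py` above is a flat tensor that gets sliced into read keys, read strengths, a write key, a write strength, an erase vector, a write vector, and the free/allocation/write gates plus read modes; for `r` read heads and cell width `w` its size works out to `r*w + 3*w + 5*r + 3`. The snippet below is a minimal, standalone sanity check of that partition (shapes only, without the softplus/sigmoid/softmax activations applied in the real code); the helper name `split_interface` and the toy sizes are illustrative and not part of this repository.

```python
import torch

def split_interface(xi, r, w):
    """Slice a flat DNC-style interface vector into its named parts (shape check only)."""
    b = xi.size(0)
    assert xi.size(1) == r * w + 3 * w + 5 * r + 3, "unexpected interface size"
    parts = {
        'read_keys':       xi[:, :r * w].reshape(b, r, w),
        'read_strengths':  xi[:, r * w:r * w + r].reshape(b, r),
        'write_key':       xi[:, r * w + r:r * w + r + w].reshape(b, 1, w),
        'write_strength':  xi[:, r * w + r + w].reshape(b, 1),
        'erase_vector':    xi[:, r * w + r + w + 1:r * w + r + 2 * w + 1].reshape(b, 1, w),
        'write_vector':    xi[:, r * w + r + 2 * w + 1:r * w + r + 3 * w + 1].reshape(b, 1, w),
        'free_gates':      xi[:, r * w + r + 3 * w + 1:r * w + 2 * r + 3 * w + 1].reshape(b, r),
        'allocation_gate': xi[:, r * w + 2 * r + 3 * w + 1].reshape(b, 1),
        'write_gate':      xi[:, r * w + 2 * r + 3 * w + 2].reshape(b, 1),
        'read_modes':      xi[:, r * w + 2 * r + 3 * w + 3:].reshape(b, r, 3),
    }
    return parts

if __name__ == '__main__':
    b, r, w = 2, 4, 16  # toy batch size, read heads, cell width (illustrative only)
    xi = torch.randn(b, r * w + 3 * w + 5 * r + 3)
    for name, part in split_interface(xi, r, w).items():
        print(name, tuple(part.shape))
```

With the toy sizes `r = 4` and `w = 16`, the interface width is 64 + 48 + 20 + 3 = 135.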