# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------

from cfgs.path_cfgs import PATH

import os, torch, random
import numpy as np
from types import MethodType


class Cfgs(PATH):
    def __init__(self, EXP_NAME, DATASET_PATH):
        super(Cfgs, self).__init__(EXP_NAME, DATASET_PATH)

        # Set devices
        # For multi-GPU training, set e.g. '0, 1, 2' instead
        self.GPU = '0'

        # Set the RNG seed for the CPU and GPUs
        self.SEED = random.randint(0, 99999999)

        # -------------------------
        # ---- Version Control ----
        # -------------------------

        # Define a specific name to start a new training run
        # self.VERSION = 'Anonymous_' + str(self.SEED)
        self.VERSION = str(self.SEED)

        # Resume training
        self.RESUME = False

        # Used in resumed training and testing
        self.CKPT_VERSION = self.VERSION
        self.CKPT_EPOCH = 0

        # Absolute checkpoint path; when set, 'CKPT_VERSION' and 'CKPT_EPOCH' are overridden
        self.CKPT_PATH = None

        # Print the loss at every step
        self.VERBOSE = True


        # ------------------------------
        # ---- Data Provider Params ----
        # ------------------------------

        # {'train', 'val', 'test'}
        self.RUN_MODE = 'train'

        # Set True to evaluate offline after every epoch
        self.EVAL_EVERY_EPOCH = True

        # # Define the 'train' 'val' 'test' data split
        # # (EVAL_EVERY_EPOCH is only triggered when set to {'train': 'train'})
        # self.SPLIT = {
        #     'train': '',
        #     'val': 'val',
        #     'test': 'test',
        # }
        # # An external method to set the train split
        # self.TRAIN_SPLIT = 'train+val+vg'

        # Set True to use a pretrained word embedding
        # (GloVe: spaCy https://spacy.io/)
        self.USE_GLOVE = True

        # Word embedding matrix size
        # (token size x WORD_EMBED_SIZE)
        self.WORD_EMBED_SIZE = 300

        # Max length of question sentences
        self.MAX_TOKEN = 15

        # VGG 4096-D frame features
        self.FRAME_FEAT_SIZE = 4096

        # C3D 4096-D clip features
        self.CLIP_FEAT_SIZE = 4096

        # Answer vocabulary size
        self.NUM_ANS = 1000

        # Default training batch size: 64
        self.BATCH_SIZE = 64

        # Multi-thread I/O
        self.NUM_WORKERS = 8

        # Use pinned memory
        # (Warning: pinned memory can accelerate GPU loading but may
        # increase CPU memory usage when NUM_WORKERS is large)
        self.PIN_MEM = True

        # Large models cannot train with a batch size of 64;
        # gradient accumulation splits each batch into sub-batches to reduce GPU memory usage
        # (Warning: BATCH_SIZE must be divisible by GRAD_ACCU_STEPS)
        self.GRAD_ACCU_STEPS = 1
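        # e.g. with BATCH_SIZE = 64 and GRAD_ACCU_STEPS = 2, proc() below derives
        # SUB_BATCH_SIZE = 64 / 2 = 32, so each optimizer step accumulates the
        # gradients of two forward passes over 32 samples each (illustrative values)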

        # Set 'external': use an external shuffle method to shuffle the training data
        # Set 'internal': use the pytorch dataloader's default shuffle method
        self.SHUFFLE_MODE = 'external'


        # ------------------------
        # ---- Network Params ----
        # ------------------------

        # Model depth
        # (the encoder and decoder share the same depth)
        self.LAYER = 6

        # Model hidden size
        # (512 by default; larger values sharply increase GPU memory usage)
        self.HIDDEN_SIZE = 512

        # Number of attention heads in the MCA layers
        # (Warning: HIDDEN_SIZE must be divisible by MULTI_HEAD)
        self.MULTI_HEAD = 8
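        # e.g. HIDDEN_SIZE = 512 with MULTI_HEAD = 8 gives a per-head size of
        # 512 / 8 = 64 (HIDDEN_SIZE_HEAD, derived in proc() below)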

        # Dropout rate for all dropout layers
        # (dropout helps prevent overfitting: [Dropout: A Simple Way to Prevent Neural Networks from Overfitting])
        self.DROPOUT_R = 0.1

        # MLP size in the flatten layers
        self.FLAT_MLP_SIZE = 512

        # Flatten the last hidden state to a vector with {n} attention glimpses
        self.FLAT_GLIMPSES = 1
        self.FLAT_OUT_SIZE = 1024


        # --------------------------
        # ---- Optimizer Params ----
        # --------------------------

        # The base learning rate
        self.LR_BASE = 0.0001

        # Learning rate decay ratio
        self.LR_DECAY_R = 0.2

        # Decay the learning rate at epochs {x, y, z, ...}
        self.LR_DECAY_LIST = [10, 12]
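        # assuming the decay ratio is applied multiplicatively at each epoch in
        # LR_DECAY_LIST: 1e-4 -> 1e-4 * 0.2 = 2e-5 after epoch 10,
        # then 2e-5 * 0.2 = 4e-6 after epoch 12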

        # Max training epochs
        self.MAX_EPOCH = 30

        # Gradient clipping
        # (default: -1 means disabled)
        self.GRAD_NORM_CLIP = -1
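        # when set > 0, a training loop would typically pass this to
        # torch.nn.utils.clip_grad_norm_(model.parameters(), self.GRAD_NORM_CLIP)
        # (illustrative call; the consuming code lives outside this file)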

        # Adam optimizer betas and eps
        self.OPT_BETAS = (0.9, 0.98)
        self.OPT_EPS = 1e-9
        self.OPT_WEIGHT_DECAY = 1e-5

        # --------------------------
        # ---- DNC Hyper-Params ----
        # --------------------------
        self.IN_SIZE_DNC = self.HIDDEN_SIZE
        self.OUT_SIZE_DNC = self.HIDDEN_SIZE
        self.WORD_LENGTH_DNC = 512
        self.CELL_COUNT_DNC = 64
        self.MEM_HIDDEN_SIZE = self.CELL_COUNT_DNC * self.WORD_LENGTH_DNC
        self.N_READ_HEADS_DNC = 4
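        # i.e. a DNC memory matrix of CELL_COUNT_DNC x WORD_LENGTH_DNC cells, so
        # MEM_HIDDEN_SIZE = 64 * 512 = 32768 with the defaults above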

    def parse_to_dict(self, args):
        # Collect every public, non-method, non-None attribute of 'args' into a dict
        args_dict = {}
        for arg in dir(args):
            if not arg.startswith('_') and not isinstance(getattr(args, arg), MethodType):
                if getattr(args, arg) is not None:
                    args_dict[arg] = getattr(args, arg)

        return args_dict
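
    # Illustrative example (values are hypothetical, not from the original file):
    # parse_to_dict(argparse.Namespace(RUN_MODE='train', GPU=None)) returns
    # {'RUN_MODE': 'train'} -- None-valued args are skipped so they never
    # clobber the defaults set in __init__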

    def add_args(self, args_dict):
        # Copy parsed command-line overrides onto this config object
        for arg in args_dict:
            setattr(self, arg, args_dict[arg])


    def proc(self):
        assert self.RUN_MODE in ['train', 'val', 'test']

        # ------------ Devices setup
        # os.environ['CUDA_VISIBLE_DEVICES'] = self.GPU
        self.N_GPU = len(self.GPU.split(','))
        self.DEVICES = [_ for _ in range(self.N_GPU)]
        torch.set_num_threads(2)


        # ------------ Seed setup
        # fix the pytorch seed
        torch.manual_seed(self.SEED)
        if self.N_GPU < 2:
            torch.cuda.manual_seed(self.SEED)
        else:
            torch.cuda.manual_seed_all(self.SEED)
        torch.backends.cudnn.deterministic = True

        # fix the numpy seed
        np.random.seed(self.SEED)

        # fix the random seed
        random.seed(self.SEED)

        if self.CKPT_PATH is not None:
            print('Warning: you are now using the CKPT_PATH arg; '
                  'CKPT_VERSION and CKPT_EPOCH will not work')
            self.CKPT_VERSION = self.CKPT_PATH.split('/')[-1] + '_' + str(random.randint(0, 99999999))


        # ------------ Split setup
        self.SPLIT['train'] = self.TRAIN_SPLIT
        if 'val' in self.SPLIT['train'].split('+') or self.RUN_MODE not in ['train']:
            self.EVAL_EVERY_EPOCH = False

        if self.RUN_MODE not in ['test']:
            self.TEST_SAVE_PRED = False


        # ------------ Gradient accumulation setup
        assert self.BATCH_SIZE % self.GRAD_ACCU_STEPS == 0
        self.SUB_BATCH_SIZE = int(self.BATCH_SIZE / self.GRAD_ACCU_STEPS)

        # Using a smaller eval batch size reduces GPU memory usage
        self.EVAL_BATCH_SIZE = 32


        # ------------ Networks setup
        # FeedForwardNet size in every MCA layer
        self.FF_SIZE = int(self.HIDDEN_SIZE * 4)
        # Note: int() evaluates to 0; the memory feed-forward size is left unset here
        self.FF_MEM_SIZE = int()

        # Per-head hidden size in the attention computation
        assert self.HIDDEN_SIZE % self.MULTI_HEAD == 0
        self.HIDDEN_SIZE_HEAD = int(self.HIDDEN_SIZE / self.MULTI_HEAD)

    def __str__(self):
        # Print every public, non-method attribute; return '' so that
        # print(cfg) works without duplicating output
        for attr in dir(self):
            if not attr.startswith('__') and not isinstance(getattr(self, attr), MethodType):
                print('{ %-17s }->' % attr, getattr(self, attr))

        return ''

    def check_path(self):
        print('Checking dataset ...')

        if not os.path.exists(self.FRAMES):
            print(self.FRAMES + ' NOT EXIST')
            exit(-1)

        if not os.path.exists(self.CLIPS):
            print(self.CLIPS + ' NOT EXIST')
            exit(-1)

        for mode in self.QA_PATH:
            if not os.path.exists(self.QA_PATH[mode]):
                print(self.QA_PATH[mode] + ' NOT EXIST')
                exit(-1)

        print('Finished')
        print('')