vlcn/code/cfgs/base_cfgs.py

268 lines
7.9 KiB
Python
Raw Normal View History

2022-03-30 10:46:35 +02:00
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from cfgs.path_cfgs import PATH
import os, torch, random
import numpy as np
from types import MethodType
class Cfgs(PATH):
def __init__(self, EXP_NAME, DATASET_PATH):
super(Cfgs, self).__init__(EXP_NAME, DATASET_PATH)
# Set Devices
# If use multi-gpu training, set e.g.'0, 1, 2' instead
self.GPU = '0'
# Set RNG For CPU And GPUs
self.SEED = random.randint(0, 99999999)
# -------------------------
# ---- Version Control ----
# -------------------------
# Define a specific name to start new training
# self.VERSION = 'Anonymous_' + str(self.SEED)
self.VERSION = str(self.SEED)
# Resume training
self.RESUME = False
# Used in Resume training and testing
self.CKPT_VERSION = self.VERSION
self.CKPT_EPOCH = 0
# Absolutely checkpoint path, 'CKPT_VERSION' and 'CKPT_EPOCH' will be overridden
self.CKPT_PATH = None
# Print loss every step
self.VERBOSE = True
# ------------------------------
# ---- Data Provider Params ----
# ------------------------------
# {'train', 'val', 'test'}
self.RUN_MODE = 'train'
# Set True to evaluate offline
self.EVAL_EVERY_EPOCH = True
# # Define the 'train' 'val' 'test' data split
# # (EVAL_EVERY_EPOCH triggered when set {'train': 'train'})
# self.SPLIT = {
# 'train': '',
# 'val': 'val',
# 'test': 'test',
# }
# # A external method to set train split
# self.TRAIN_SPLIT = 'train+val+vg'
# Set True to use pretrained word embedding
# (GloVe: spaCy https://spacy.io/)
self.USE_GLOVE = True
# Word embedding matrix size
# (token size x WORD_EMBED_SIZE)
self.WORD_EMBED_SIZE = 300
# Max length of question sentences
self.MAX_TOKEN = 15
# VGG 4096D features
self.FRAME_FEAT_SIZE = 4096
# C3D 4096D features
self.CLIP_FEAT_SIZE = 4096
self.NUM_ANS = 1000
# Default training batch size: 64
self.BATCH_SIZE = 64
# Multi-thread I/O
self.NUM_WORKERS = 8
# Use pin memory
# (Warning: pin memory can accelerate GPU loading but may
# increase the CPU memory usage when NUM_WORKS is large)
self.PIN_MEM = True
# Large model can not training with batch size 64
# Gradient accumulate can split batch to reduce gpu memory usage
# (Warning: BATCH_SIZE should be divided by GRAD_ACCU_STEPS)
self.GRAD_ACCU_STEPS = 1
# Set 'external': use external shuffle method to implement training shuffle
# Set 'internal': use pytorch dataloader default shuffle method
self.SHUFFLE_MODE = 'external'
# ------------------------
# ---- Network Params ----
# ------------------------
# Model deeps
# (Encoder and Decoder will be same deeps)
self.LAYER = 6
# Model hidden size
# (512 as default, bigger will be a sharp increase of gpu memory usage)
self.HIDDEN_SIZE = 512
# Multi-head number in MCA layers
# (Warning: HIDDEN_SIZE should be divided by MULTI_HEAD)
self.MULTI_HEAD = 8
# Dropout rate for all dropout layers
# (dropout can prevent overfitting [Dropout: a simple way to prevent neural networks from overfitting])
self.DROPOUT_R = 0.1
# MLP size in flatten layers
self.FLAT_MLP_SIZE = 512
# Flatten the last hidden to vector with {n} attention glimpses
self.FLAT_GLIMPSES = 1
self.FLAT_OUT_SIZE = 1024
# --------------------------
# ---- Optimizer Params ----
# --------------------------
# The base learning rate
self.LR_BASE = 0.0001
# Learning rate decay ratio
self.LR_DECAY_R = 0.2
# Learning rate decay at {x, y, z...} epoch
self.LR_DECAY_LIST = [10, 12]
# Max training epoch
self.MAX_EPOCH = 30
# Gradient clip
# (default: -1 means not using)
self.GRAD_NORM_CLIP = -1
# Adam optimizer betas and eps
self.OPT_BETAS = (0.9, 0.98)
self.OPT_EPS = 1e-9
self.OPT_WEIGHT_DECAY = 1e-5
# --------------------------
# ---- DNC Hyper-Params ----
# --------------------------
self.IN_SIZE_DNC = self.HIDDEN_SIZE
self.OUT_SIZE_DNC = self.HIDDEN_SIZE
self.WORD_LENGTH_DNC = 512
self.CELL_COUNT_DNC = 64
self.MEM_HIDDEN_SIZE = self.CELL_COUNT_DNC * self.WORD_LENGTH_DNC
self.N_READ_HEADS_DNC = 4
def parse_to_dict(self, args):
args_dict = {}
for arg in dir(args):
if not arg.startswith('_') and not isinstance(getattr(args, arg), MethodType):
if getattr(args, arg) is not None:
args_dict[arg] = getattr(args, arg)
return args_dict
def add_args(self, args_dict):
for arg in args_dict:
setattr(self, arg, args_dict[arg])
def proc(self):
assert self.RUN_MODE in ['train', 'val', 'test']
# ------------ Devices setup
# os.environ['CUDA_VISIBLE_DEVICES'] = self.GPU
self.N_GPU = len(self.GPU.split(','))
self.DEVICES = [_ for _ in range(self.N_GPU)]
torch.set_num_threads(2)
# ------------ Seed setup
# fix pytorch seed
torch.manual_seed(self.SEED)
if self.N_GPU < 2:
torch.cuda.manual_seed(self.SEED)
else:
torch.cuda.manual_seed_all(self.SEED)
torch.backends.cudnn.deterministic = True
# fix numpy seed
np.random.seed(self.SEED)
# fix random seed
random.seed(self.SEED)
if self.CKPT_PATH is not None:
print('Warning: you are now using CKPT_PATH args, '
'CKPT_VERSION and CKPT_EPOCH will not work')
self.CKPT_VERSION = self.CKPT_PATH.split('/')[-1] + '_' + str(random.randint(0, 99999999))
# ------------ Split setup
self.SPLIT['train'] = self.TRAIN_SPLIT
if 'val' in self.SPLIT['train'].split('+') or self.RUN_MODE not in ['train']:
self.EVAL_EVERY_EPOCH = False
if self.RUN_MODE not in ['test']:
self.TEST_SAVE_PRED = False
# ------------ Gradient accumulate setup
assert self.BATCH_SIZE % self.GRAD_ACCU_STEPS == 0
self.SUB_BATCH_SIZE = int(self.BATCH_SIZE / self.GRAD_ACCU_STEPS)
# Use a small eval batch will reduce gpu memory usage
self.EVAL_BATCH_SIZE = 32
# ------------ Networks setup
# FeedForwardNet size in every MCA layer
self.FF_SIZE = int(self.HIDDEN_SIZE * 4)
self.FF_MEM_SIZE = int()
# A pipe line hidden size in attention compute
assert self.HIDDEN_SIZE % self.MULTI_HEAD == 0
self.HIDDEN_SIZE_HEAD = int(self.HIDDEN_SIZE / self.MULTI_HEAD)
def __str__(self):
for attr in dir(self):
if not attr.startswith('__') and not isinstance(getattr(self, attr), MethodType):
print('{ %-17s }->' % attr, getattr(self, attr))
return ''
def check_path(self):
print('Checking dataset ...')
if not os.path.exists(self.FRAMES):
print(self.FRAMES + 'NOT EXIST')
exit(-1)
if not os.path.exists(self.CLIPS):
print(self.CLIPS + 'NOT EXIST')
exit(-1)
for mode in self.QA_PATH:
if not os.path.exists(self.QA_PATH[mode]):
print(self.QA_PATH[mode] + 'NOT EXIST')
exit(-1)
print('Finished')
print('')