vlcn/code/cfgs/base_cfgs.py
2022-03-30 10:46:35 +02:00

267 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from cfgs.path_cfgs import PATH
import os, torch, random
import numpy as np
from types import MethodType
class Cfgs(PATH):
def __init__(self, EXP_NAME, DATASET_PATH):
super(Cfgs, self).__init__(EXP_NAME, DATASET_PATH)
# Set Devices
# If use multi-gpu training, set e.g.'0, 1, 2' instead
self.GPU = '0'
# Set RNG For CPU And GPUs
self.SEED = random.randint(0, 99999999)
# -------------------------
# ---- Version Control ----
# -------------------------
# Define a specific name to start new training
# self.VERSION = 'Anonymous_' + str(self.SEED)
self.VERSION = str(self.SEED)
# Resume training
self.RESUME = False
# Used in Resume training and testing
self.CKPT_VERSION = self.VERSION
self.CKPT_EPOCH = 0
# Absolutely checkpoint path, 'CKPT_VERSION' and 'CKPT_EPOCH' will be overridden
self.CKPT_PATH = None
# Print loss every step
self.VERBOSE = True
# ------------------------------
# ---- Data Provider Params ----
# ------------------------------
# {'train', 'val', 'test'}
self.RUN_MODE = 'train'
# Set True to evaluate offline
self.EVAL_EVERY_EPOCH = True
# # Define the 'train' 'val' 'test' data split
# # (EVAL_EVERY_EPOCH triggered when set {'train': 'train'})
# self.SPLIT = {
# 'train': '',
# 'val': 'val',
# 'test': 'test',
# }
# # A external method to set train split
# self.TRAIN_SPLIT = 'train+val+vg'
# Set True to use pretrained word embedding
# (GloVe: spaCy https://spacy.io/)
self.USE_GLOVE = True
# Word embedding matrix size
# (token size x WORD_EMBED_SIZE)
self.WORD_EMBED_SIZE = 300
# Max length of question sentences
self.MAX_TOKEN = 15
# VGG 4096D features
self.FRAME_FEAT_SIZE = 4096
# C3D 4096D features
self.CLIP_FEAT_SIZE = 4096
self.NUM_ANS = 1000
# Default training batch size: 64
self.BATCH_SIZE = 64
# Multi-thread I/O
self.NUM_WORKERS = 8
# Use pin memory
# (Warning: pin memory can accelerate GPU loading but may
# increase the CPU memory usage when NUM_WORKS is large)
self.PIN_MEM = True
# Large model can not training with batch size 64
# Gradient accumulate can split batch to reduce gpu memory usage
# (Warning: BATCH_SIZE should be divided by GRAD_ACCU_STEPS)
self.GRAD_ACCU_STEPS = 1
# Set 'external': use external shuffle method to implement training shuffle
# Set 'internal': use pytorch dataloader default shuffle method
self.SHUFFLE_MODE = 'external'
# ------------------------
# ---- Network Params ----
# ------------------------
# Model deeps
# (Encoder and Decoder will be same deeps)
self.LAYER = 6
# Model hidden size
# (512 as default, bigger will be a sharp increase of gpu memory usage)
self.HIDDEN_SIZE = 512
# Multi-head number in MCA layers
# (Warning: HIDDEN_SIZE should be divided by MULTI_HEAD)
self.MULTI_HEAD = 8
# Dropout rate for all dropout layers
# (dropout can prevent overfitting [Dropout: a simple way to prevent neural networks from overfitting])
self.DROPOUT_R = 0.1
# MLP size in flatten layers
self.FLAT_MLP_SIZE = 512
# Flatten the last hidden to vector with {n} attention glimpses
self.FLAT_GLIMPSES = 1
self.FLAT_OUT_SIZE = 1024
# --------------------------
# ---- Optimizer Params ----
# --------------------------
# The base learning rate
self.LR_BASE = 0.0001
# Learning rate decay ratio
self.LR_DECAY_R = 0.2
# Learning rate decay at {x, y, z...} epoch
self.LR_DECAY_LIST = [10, 12]
# Max training epoch
self.MAX_EPOCH = 30
# Gradient clip
# (default: -1 means not using)
self.GRAD_NORM_CLIP = -1
# Adam optimizer betas and eps
self.OPT_BETAS = (0.9, 0.98)
self.OPT_EPS = 1e-9
self.OPT_WEIGHT_DECAY = 1e-5
# --------------------------
# ---- DNC Hyper-Params ----
# --------------------------
self.IN_SIZE_DNC = self.HIDDEN_SIZE
self.OUT_SIZE_DNC = self.HIDDEN_SIZE
self.WORD_LENGTH_DNC = 512
self.CELL_COUNT_DNC = 64
self.MEM_HIDDEN_SIZE = self.CELL_COUNT_DNC * self.WORD_LENGTH_DNC
self.N_READ_HEADS_DNC = 4
def parse_to_dict(self, args):
args_dict = {}
for arg in dir(args):
if not arg.startswith('_') and not isinstance(getattr(args, arg), MethodType):
if getattr(args, arg) is not None:
args_dict[arg] = getattr(args, arg)
return args_dict
def add_args(self, args_dict):
for arg in args_dict:
setattr(self, arg, args_dict[arg])
def proc(self):
assert self.RUN_MODE in ['train', 'val', 'test']
# ------------ Devices setup
# os.environ['CUDA_VISIBLE_DEVICES'] = self.GPU
self.N_GPU = len(self.GPU.split(','))
self.DEVICES = [_ for _ in range(self.N_GPU)]
torch.set_num_threads(2)
# ------------ Seed setup
# fix pytorch seed
torch.manual_seed(self.SEED)
if self.N_GPU < 2:
torch.cuda.manual_seed(self.SEED)
else:
torch.cuda.manual_seed_all(self.SEED)
torch.backends.cudnn.deterministic = True
# fix numpy seed
np.random.seed(self.SEED)
# fix random seed
random.seed(self.SEED)
if self.CKPT_PATH is not None:
print('Warning: you are now using CKPT_PATH args, '
'CKPT_VERSION and CKPT_EPOCH will not work')
self.CKPT_VERSION = self.CKPT_PATH.split('/')[-1] + '_' + str(random.randint(0, 99999999))
# ------------ Split setup
self.SPLIT['train'] = self.TRAIN_SPLIT
if 'val' in self.SPLIT['train'].split('+') or self.RUN_MODE not in ['train']:
self.EVAL_EVERY_EPOCH = False
if self.RUN_MODE not in ['test']:
self.TEST_SAVE_PRED = False
# ------------ Gradient accumulate setup
assert self.BATCH_SIZE % self.GRAD_ACCU_STEPS == 0
self.SUB_BATCH_SIZE = int(self.BATCH_SIZE / self.GRAD_ACCU_STEPS)
# Use a small eval batch will reduce gpu memory usage
self.EVAL_BATCH_SIZE = 32
# ------------ Networks setup
# FeedForwardNet size in every MCA layer
self.FF_SIZE = int(self.HIDDEN_SIZE * 4)
self.FF_MEM_SIZE = int()
# A pipe line hidden size in attention compute
assert self.HIDDEN_SIZE % self.MULTI_HEAD == 0
self.HIDDEN_SIZE_HEAD = int(self.HIDDEN_SIZE / self.MULTI_HEAD)
def __str__(self):
for attr in dir(self):
if not attr.startswith('__') and not isinstance(getattr(self, attr), MethodType):
print('{ %-17s }->' % attr, getattr(self, attr))
return ''
def check_path(self):
print('Checking dataset ...')
if not os.path.exists(self.FRAMES):
print(self.FRAMES + 'NOT EXIST')
exit(-1)
if not os.path.exists(self.CLIPS):
print(self.CLIPS + 'NOT EXIST')
exit(-1)
for mode in self.QA_PATH:
if not os.path.exists(self.QA_PATH[mode]):
print(self.QA_PATH[mode] + 'NOT EXIST')
exit(-1)
print('Finished')
print('')