Initial commit

Adnen Abdessaied 2022-03-30 10:46:35 +02:00
commit b5f3b728c3
53 changed files with 7008 additions and 0 deletions

0
code/core/.gitkeep Normal file

0
code/core/data/.gitkeep Normal file

103
code/core/data/dataset.py Normal file

@@ -0,0 +1,103 @@
import glob, os, json, pickle
import numpy as np
from collections import defaultdict
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from core.data.utils import tokenize, ans_stat, proc_ques, qlen_to_key, ans_to_key
class VideoQA_Dataset(Dataset):
def __init__(self, __C):
super(VideoQA_Dataset, self).__init__()
self.__C = __C
self.ans_size = __C.NUM_ANS
# load raw data
with open(__C.QA_PATH[__C.RUN_MODE], 'r') as f:
self.raw_data = json.load(f)
self.data_size = len(self.raw_data)
splits = __C.SPLIT[__C.RUN_MODE].split('+')
frames_list = glob.glob(__C.FRAMES + '*.pt')
clips_list = glob.glob(__C.CLIPS + '*.pt')
if 'msvd' in self.__C.DATASET_PATH.lower():
vid_ids = [int(s.split('/')[-1].split('.')[0][3:]) for s in frames_list]
clip_ids = [int(s.split('/')[-1].split('.')[0][3:]) for s in clips_list]
else:
vid_ids = [int(s.split('/')[-1].split('.')[0][5:]) for s in frames_list]
clip_ids = [int(s.split('/')[-1].split('.')[0][5:]) for s in clips_list]
self.frames_dict = {k: v for (k, v) in zip(vid_ids, frames_list)}
self.clips_dict = {k: v for (k, v) in zip(clip_ids, clips_list)}
del frames_list, clips_list
q_list = []
a_list = []
a_dict = defaultdict(lambda: 0)
for split in ['train', 'val']:
with open(__C.QA_PATH[split], 'r') as f:
qa_data = json.load(f)
for d in qa_data:
q_list.append(d['question'])
a_list.append(d['answer'])
a_dict[d['answer']] += 1
top_answers = sorted(a_dict, key=a_dict.get, reverse=True)
self.qlen_bins_to_idx = {
'1-3': 0,
'4-8': 1,
'9-15': 2,
}
self.ans_rare_to_idx = {
'0-99': 0,
'100-299': 1,
'300-999': 2,
}
self.qtypes_to_idx = {
'what': 0,
'who': 1,
'how': 2,
'when': 3,
'where': 4,
}
if __C.RUN_MODE == 'train':
self.ans_list = top_answers[:self.ans_size]
self.ans_to_ix, self.ix_to_ans = ans_stat(self.ans_list)
self.token_to_ix, self.pretrained_emb = tokenize(q_list, __C.USE_GLOVE)
self.token_size = len(self.token_to_ix)
print('== Question token vocab size:', self.token_size)
self.idx_to_qtypes = {v: k for (k, v) in self.qtypes_to_idx.items()}
self.idx_to_qlen_bins = {v: k for (k, v) in self.qlen_bins_to_idx.items()}
self.idx_to_ans_rare = {v: k for (k, v) in self.ans_rare_to_idx.items()}
def __getitem__(self, idx):
sample = self.raw_data[idx]
ques = sample['question']
q_type = self.qtypes_to_idx[ques.split(' ')[0]]
ques_idx, qlen, _ = proc_ques(ques, self.token_to_ix, self.__C.MAX_TOKEN)
qlen_bin = self.qlen_bins_to_idx[qlen_to_key(qlen)]
answer = sample['answer']
# out-of-vocabulary answers fall back to a random in-vocabulary index
answer = self.ans_to_ix.get(answer, np.random.randint(0, high=len(self.ans_list)))
ans_rarity = self.ans_rare_to_idx[ans_to_key(answer)]
answer_one_hot = torch.zeros(self.ans_size)
answer_one_hot[answer] = 1.0
vid_id = sample['video_id']
frames = torch.load(open(self.frames_dict[vid_id], 'rb')).cpu()
clips = torch.load(open(self.clips_dict[vid_id], 'rb')).cpu()
return torch.from_numpy(ques_idx).long(), frames, clips, answer_one_hot, torch.tensor(answer).long(), \
torch.tensor(q_type).long(), torch.tensor(qlen_bin).long(), torch.tensor(ans_rarity).long()
def __len__(self):
return self.data_size
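The dataset plugs directly into a standard PyTorch DataLoader. Below is a minimal sketch (not part of the commit), assuming the QA JSON files and the extracted .pt frame/clip features already exist on disk; the Cfg class is a hypothetical stand-in that only mirrors the attributes VideoQA_Dataset reads:

import torch.utils.data as Data
from core.data.dataset import VideoQA_Dataset

class Cfg:  # hypothetical stand-in for the project's config class
    RUN_MODE = 'train'
    NUM_ANS = 1000
    USE_GLOVE = False  # True additionally requires the en_vectors_web_lg model
    MAX_TOKEN = 15
    QA_PATH = {'train': 'data/train_qa.json', 'val': 'data/val_qa.json'}
    SPLIT = {'train': 'train'}
    FRAMES = 'data/frame_feat/'
    CLIPS = 'data/clip_feat/'
    DATASET_PATH = 'data/msvd/'

dataset = VideoQA_Dataset(Cfg())
loader = Data.DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
ques_ix, frames, clips, ans_hot, ans, q_type, qlen_bin, rarity = next(iter(loader))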


@@ -0,0 +1,182 @@
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import skvideo.io as skv
import torch
import pickle
from PIL import Image
import tqdm
import numpy as np
from model.C3D import C3D
import json
from torchvision.models import vgg19
import torchvision.transforms as transforms
import torch.nn as nn
import argparse
def _select_frames(path, frame_num):
"""Select representative frames for video.
Ignore some frames both at begin and end of video.
Args:
path: Path of video.
Returns:
frames: list of frames.
"""
frames = list()
video_data = skv.vread(path)
total_frames = video_data.shape[0]
# Ignore the first and last sample points (begin/end of video).
for i in np.linspace(0, total_frames, frame_num + 2)[1:frame_num + 1]:
frame_data = video_data[int(i)]
img = Image.fromarray(frame_data)
img = img.resize((224, 224), Image.BILINEAR)
frame_data = np.array(img)
frames.append(frame_data)
return frames
def _select_clips(path, clip_num):
"""Select self.batch_size clips for video. Each clip has 16 frames.
Args:
path: Path of video.
Returns:
clips: list of clips.
"""
clips = list()
# video_info = skvideo.io.ffprobe(path)
video_data = skv.vread(path)
total_frames = video_data.shape[0]
height = video_data.shape[1]
width = video_data.shape[2]
for i in np.linspace(0, total_frames, clip_num + 2)[1:clip_num + 1]:
# Select center frame first, then include surrounding frames
clip_start = int(i) - 8
clip_end = int(i) + 8
if clip_start < 0:
clip_end = clip_end - clip_start
clip_start = 0
if clip_end > total_frames:
clip_start = clip_start - (clip_end - total_frames)
clip_end = total_frames
clip = video_data[clip_start:clip_end]
new_clip = []
for j in range(16):
frame_data = clip[j]
img = Image.fromarray(frame_data)
img = img.resize((112, 112), Image.BILINEAR)
frame_data = np.array(img) * 1.0
# frame_data -= self.mean[j]
new_clip.append(frame_data)
clips.append(new_clip)
return clips
def preprocess_videos(video_dir, frame_num, clip_num):
frames_dir = os.path.join(os.path.dirname(video_dir), 'frames')
os.makedirs(frames_dir, exist_ok=True)
clips_dir = os.path.join(os.path.dirname(video_dir), 'clips')
os.makedirs(clips_dir, exist_ok=True)
for video_name in tqdm.tqdm(os.listdir(video_dir)):
video_path = os.path.join(video_dir, video_name)
frames = _select_frames(video_path, frame_num)
clips = _select_clips(video_path, clip_num)
with open(os.path.join(frames_dir, video_name.split('.')[0] + '.pkl'), "wb") as f:
pickle.dump(frames, f, protocol=pickle.HIGHEST_PROTOCOL)
with open(os.path.join(clips_dir, video_name.split('.')[0] + '.pkl'), "wb") as f:
pickle.dump(clips, f, protocol=pickle.HIGHEST_PROTOCOL)
def generate_video_features(path_frames, path_clips, c3d_path):
device = torch.device('cuda:0')
frame_feat_dir = os.path.join(os.path.dirname(path_frames), 'frame_feat')
os.makedirs(frame_feat_dir, exist_ok=True)
clip_feat_dir = os.path.join(os.path.dirname(path_frames), 'clip_feat')
os.makedirs(clip_feat_dir, exist_ok=True)
cnn = vgg19(pretrained=True)
in_features = cnn.classifier[-1].in_features
cnn.classifier = nn.Sequential(
*list(cnn.classifier.children())[:-1]) # remove last fc layer
cnn.to(device).eval()
c3d = C3D()
c3d.load_state_dict(torch.load(c3d_path))
c3d.to(device).eval()
transform = transforms.Compose([transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406),
(0.229, 0.224, 0.225))])
for vid_name in tqdm.tqdm(os.listdir(path_frames)):
frame_path = os.path.join(path_frames, vid_name)
clip_path = os.path.join(path_clips, vid_name)
frames = pickle.load(open(frame_path, 'rb'))
clips = pickle.load(open(clip_path, 'rb'))
frames = [transform(f) for f in frames]
frame_feat = []
clip_feat = []
for frame in frames:
with torch.no_grad():
feat = cnn(frame.unsqueeze(0).to(device))
frame_feat.append(feat)
for clip in clips:
# clip has shape (c x f x h x w)
clip = torch.from_numpy(np.float32(np.array(clip)))
clip = clip.transpose(3, 0)
clip = clip.transpose(3, 1)
clip = clip.transpose(3, 2).unsqueeze(0).to(device)
with torch.no_grad():
feat = c3d(clip)
clip_feat.append(feat)
frame_feat = torch.cat(frame_feat, dim=0)
clip_feat = torch.cat(clip_feat, dim=0)
torch.save(frame_feat, os.path.join(frame_feat_dir, vid_name.split('.')[0] + '.pt'))
torch.save(clip_feat, os.path.join(clip_feat_dir, vid_name.split('.')[0] + '.pt'))
def parse_args():
'''
Parse input arguments
'''
parser = argparse.ArgumentParser(description='Preprocessing Args')
parser.add_argument('--RAW_VID_PATH', dest='RAW_VID_PATH',
help='The path to the raw videos',
required=True,
type=str)
parser.add_argument('--FRAMES_OUTPUT_DIR', dest='FRAMES_OUTPUT_DIR',
help='The directory where the processed frames and their features will be stored',
required=True,
type=str)
parser.add_argument('--CLIPS_OUTPUT_DIR', dest='CLIPS_OUTPUT_DIR',
help='The directory where the processed clips and their features will be stored',
required=True,
type=str)
parser.add_argument('--C3D_PATH', dest='C3D_PATH',
help='Pretrained C3D path',
required=True,
type=str)
parser.add_argument('--NUM_SAMPLES', dest='NUM_SAMPLES',
help='The number of frames/clips to be sampled from the video',
default=20,
type=int)
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
preprocess_videos(args.RAW_VID_PATH, args.NUM_SAMPLES, args.NUM_SAMPLES)
frames_dir = os.path.join(os.path.dirname(args.RAW_VID_PATH), 'frames')
clips_dir = os.path.join(os.path.dirname(args.RAW_VID_PATH), 'clips')
generate_video_features(frames_dir, clips_dir, args.C3D_PATH)
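For reference, a sketch of driving both preprocessing stages programmatically rather than through the CLI above. The module path and all file locations are assumptions (the script's own filename is not visible in this excerpt):

import os
from core.data.preprocess import preprocess_videos, generate_video_features  # module path assumed

raw_vids = '/data/msvd/videos'       # hypothetical directory of raw video files
preprocess_videos(raw_vids, 20, 20)  # writes <parent>/frames and <parent>/clips as .pkl
frames_dir = os.path.join(os.path.dirname(raw_vids), 'frames')
clips_dir = os.path.join(os.path.dirname(raw_vids), 'clips')
generate_video_features(frames_dir, clips_dir, '/models/c3d.pickle')  # C3D weights path is hypothetical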

81
code/core/data/utils.py Normal file

@@ -0,0 +1,81 @@
import en_vectors_web_lg, random, re, json
import numpy as np
def tokenize(ques_list, use_glove):
token_to_ix = {
'PAD': 0,
'UNK': 1,
}
spacy_tool = None
pretrained_emb = []
if use_glove:
spacy_tool = en_vectors_web_lg.load()
pretrained_emb.append(spacy_tool('PAD').vector)
pretrained_emb.append(spacy_tool('UNK').vector)
for ques in ques_list:
words = re.sub(
r"([.,'!?\"()*#:;])",
'',
ques.lower()
).replace('-', ' ').replace('/', ' ').split()
for word in words:
if word not in token_to_ix:
token_to_ix[word] = len(token_to_ix)
if use_glove:
pretrained_emb.append(spacy_tool(word).vector)
pretrained_emb = np.array(pretrained_emb)
return token_to_ix, pretrained_emb
def proc_ques(ques, token_to_ix, max_token):
ques_ix = np.zeros(max_token, np.int64)
words = re.sub(
r"([.,'!?\"()*#:;])",
'',
ques.lower()
).replace('-', ' ').replace('/', ' ').split()
q_len = 0
for ix, word in enumerate(words):
if word in token_to_ix:
ques_ix[ix] = token_to_ix[word]
q_len += 1
else:
ques_ix[ix] = token_to_ix['UNK']
if ix + 1 == max_token:
break
return ques_ix, q_len, len(words)
def ans_stat(ans_list):
ans_to_ix, ix_to_ans = {}, {}
for i, ans in enumerate(ans_list):
ans_to_ix[ans] = i
ix_to_ans[i] = ans
return ans_to_ix, ix_to_ans
def shuffle_list(ans_list):
random.shuffle(ans_list)
def qlen_to_key(q_len):
if 1 <= q_len <= 3:
return '1-3'
if 4 <= q_len <= 8:
return '4-8'
if q_len >= 9:
return '9-15'
def ans_to_key(ans_idx):
if 0 <= ans_idx <= 99:
return '0-99'
if 100 <= ans_idx <= 299:
return '100-299'
if 300 <= ans_idx <= 999:
return '300-999'
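A quick round trip through the helpers above, with use_glove=False so the example does not need the en_vectors_web_lg model:

from core.data.utils import tokenize, proc_ques, qlen_to_key

token_to_ix, _ = tokenize(['what is the man doing'], use_glove=False)
ques_ix, q_len, n_words = proc_ques('what is the man doing', token_to_ix, max_token=8)
print(ques_ix)                    # [2 3 4 5 6 0 0 0] -- word indices, PAD-filled to max_token
print(q_len, qlen_to_key(q_len))  # 5 '4-8'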

523
code/core/exec.py Normal file

@@ -0,0 +1,523 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from core.data.dataset import VideoQA_Dataset
from core.model.net import Net1, Net2, Net3, Net4
from core.model.optim import get_optim, adjust_lr
from core.metrics import get_acc
from tqdm import tqdm
from core.data.utils import shuffle_list
import os, json, torch, datetime, pickle, copy, shutil, time, math
import numpy as np
import torch.nn as nn
import torch.utils.data as Data
from tensorboardX import SummaryWriter
from torch.autograd import Variable as var
class Execution:
def __init__(self, __C):
self.__C = __C
print('Loading training set ........')
__C_train = copy.deepcopy(self.__C)
setattr(__C_train, 'RUN_MODE', 'train')
self.dataset = VideoQA_Dataset(__C_train)
self.dataset_eval = None
if self.__C.EVAL_EVERY_EPOCH:
__C_eval = copy.deepcopy(self.__C)
setattr(__C_eval, 'RUN_MODE', 'val')
print('Loading validation set for per-epoch evaluation ........')
self.dataset_eval = VideoQA_Dataset(__C_eval)
self.dataset_eval.ans_list = self.dataset.ans_list
self.dataset_eval.ans_to_ix, self.dataset_eval.ix_to_ans = self.dataset.ans_to_ix, self.dataset.ix_to_ans
self.dataset_eval.token_to_ix, self.dataset_eval.pretrained_emb = self.dataset.token_to_ix, self.dataset.pretrained_emb
__C_test = copy.deepcopy(self.__C)
setattr(__C_test, 'RUN_MODE', 'test')
self.dataset_test = VideoQA_Dataset(__C_test)
self.dataset_test.ans_list = self.dataset.ans_list
self.dataset_test.ans_to_ix, self.dataset_test.ix_to_ans = self.dataset.ans_to_ix, self.dataset.ix_to_ans
self.dataset_test.token_to_ix, self.dataset_test.pretrained_emb = self.dataset.token_to_ix, self.dataset.pretrained_emb
self.writer = SummaryWriter(self.__C.TB_PATH)
def train(self, dataset, dataset_eval=None):
# Obtain needed information
data_size = dataset.data_size
token_size = dataset.token_size
ans_size = dataset.ans_size
pretrained_emb = dataset.pretrained_emb
net = self.construct_net(self.__C.MODEL_TYPE)
if os.path.isfile(self.__C.PRETRAINED_PATH) and self.__C.MODEL_TYPE == 11:
print('Loading pretrained DNC weights')
net.load_pretrained_weights()
net.cuda()
net.train()
# Define the multi-gpu training if needed
if self.__C.N_GPU > 1:
net = nn.DataParallel(net, device_ids=self.__C.DEVICES)
# Define the binary cross entropy loss
# loss_fn = torch.nn.BCELoss(size_average=False).cuda()
loss_fn = torch.nn.BCELoss(reduction='sum').cuda()
# Load checkpoint if resume training
if self.__C.RESUME:
print(' ========== Resume training')
if self.__C.CKPT_PATH is not None:
print('Warning: you are now using CKPT_PATH args, '
'CKPT_VERSION and CKPT_EPOCH will not work')
path = self.__C.CKPT_PATH
else:
path = self.__C.CKPTS_PATH + \
'ckpt_' + self.__C.CKPT_VERSION + \
'/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'
# Load the network parameters
print('Loading ckpt {}'.format(path))
ckpt = torch.load(path)
print('Finished!')
net.load_state_dict(ckpt['state_dict'])
# Load the optimizer parameters
optim = get_optim(self.__C, net, data_size, ckpt['optim'], lr_base=ckpt['lr_base'])
optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
optim.optimizer.load_state_dict(ckpt['optimizer'])
start_epoch = self.__C.CKPT_EPOCH
else:
if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
optim = get_optim(self.__C, net, data_size, self.__C.OPTIM)
start_epoch = 0
loss_sum = 0
named_params = list(net.named_parameters())
grad_norm = np.zeros(len(named_params))
# Define multi-thread dataloader
if self.__C.SHUFFLE_MODE in ['external']:
dataloader = Data.DataLoader(
dataset,
batch_size=self.__C.BATCH_SIZE,
shuffle=False,
num_workers=self.__C.NUM_WORKERS,
pin_memory=self.__C.PIN_MEM,
drop_last=True
)
else:
dataloader = Data.DataLoader(
dataset,
batch_size=self.__C.BATCH_SIZE,
shuffle=True,
num_workers=self.__C.NUM_WORKERS,
pin_memory=self.__C.PIN_MEM,
drop_last=True
)
# Training script
for epoch in range(start_epoch, self.__C.MAX_EPOCH):
# Save log information
logfile = open(
self.__C.LOG_PATH +
'log_run_' + self.__C.VERSION + '.txt',
'a+'
)
logfile.write(
'nowTime: ' +
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
'\n'
)
logfile.close()
# Learning Rate Decay
if epoch in self.__C.LR_DECAY_LIST:
adjust_lr(optim, self.__C.LR_DECAY_R)
# Externally shuffle
if self.__C.SHUFFLE_MODE == 'external':
shuffle_list(dataset.ans_list)
time_start = time.time()
# Iteration
for step, (
ques_ix_iter,
frames_feat_iter,
clips_feat_iter,
ans_iter,
_,
_,
_,
_
) in enumerate(dataloader):
ques_ix_iter = ques_ix_iter.cuda()
frames_feat_iter = frames_feat_iter.cuda()
clips_feat_iter = clips_feat_iter.cuda()
ans_iter = ans_iter.cuda()
optim.zero_grad()
for accu_step in range(self.__C.GRAD_ACCU_STEPS):
sub_frames_feat_iter = \
frames_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
(accu_step + 1) * self.__C.SUB_BATCH_SIZE]
sub_clips_feat_iter = \
clips_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
(accu_step + 1) * self.__C.SUB_BATCH_SIZE]
sub_ques_ix_iter = \
ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
(accu_step + 1) * self.__C.SUB_BATCH_SIZE]
sub_ans_iter = \
ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
(accu_step + 1) * self.__C.SUB_BATCH_SIZE]
pred = net(
sub_frames_feat_iter,
sub_clips_feat_iter,
sub_ques_ix_iter
)
loss = loss_fn(pred, sub_ans_iter)
# only mean reduction needs to be divided by grad_accu_steps;
# removing this line doesn't change our results because of the
# Adam optimizer's per-parameter scaling, but it would be
# necessary with the SGD optimizer.
# loss /= self.__C.GRAD_ACCU_STEPS
# start_backward = time.time()
loss.backward()
loss_sum += loss.cpu().data.numpy()
if self.__C.VERBOSE:
if dataset_eval is not None:
mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val']
else:
mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test']
# logging
self.writer.add_scalar(
'train/loss',
loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
global_step=step + epoch * math.ceil(data_size / self.__C.BATCH_SIZE))
self.writer.add_scalar(
'train/lr',
optim._rate,
global_step=step + epoch * math.ceil(data_size / self.__C.BATCH_SIZE))
print("\r[exp_name %s][version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % (
self.__C.EXP_NAME,
self.__C.VERSION,
epoch + 1,
step,
int(data_size / self.__C.BATCH_SIZE),
mode_str,
loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
optim._rate,
), end=' ')
# Gradient norm clipping
if self.__C.GRAD_NORM_CLIP > 0:
nn.utils.clip_grad_norm_(
net.parameters(),
self.__C.GRAD_NORM_CLIP
)
# Save the gradient information
for i in range(len(named_params)):
norm_v = torch.norm(named_params[i][1].grad).cpu().data.numpy() \
if named_params[i][1].grad is not None else 0
grad_norm[i] += norm_v * self.__C.GRAD_ACCU_STEPS
optim.step()
time_end = time.time()
print('Finished in {}s'.format(int(time_end-time_start)))
epoch_finish = epoch + 1
# Save checkpoint
state = {
'state_dict': net.state_dict(),
'optimizer': optim.optimizer.state_dict(),
'lr_base': optim.lr_base,
'optim': optim.lr_base, }
torch.save(
state,
self.__C.CKPTS_PATH +
'ckpt_' + self.__C.VERSION +
'/epoch' + str(epoch_finish) +
'.pkl'
)
# Logging
logfile = open(
self.__C.LOG_PATH +
'log_run_' + self.__C.VERSION + '.txt',
'a+'
)
logfile.write(
'epoch = ' + str(epoch_finish) +
' loss = ' + str(loss_sum / data_size) +
'\n' +
'lr = ' + str(optim._rate) +
'\n\n'
)
logfile.close()
# Eval after every epoch
if dataset_eval is not None:
self.eval(
net,
dataset_eval,
self.writer,
epoch,
valid=True,
)
loss_sum = 0
grad_norm = np.zeros(len(named_params))
# Evaluation
def eval(self, net, dataset, writer, epoch, valid=False):
ans_ix_list = []
pred_list = []
q_type_list = []
q_bin_list = []
ans_rarity_list = []
ans_qtype_dict = {'what': [], 'who': [], 'how': [], 'when': [], 'where': []}
pred_qtype_dict = {'what': [], 'who': [], 'how': [], 'when': [], 'where': []}
ans_qlen_bin_dict = {'1-3': [], '4-8': [], '9-15': []}
pred_qlen_bin_dict = {'1-3': [], '4-8': [], '9-15': []}
ans_ans_rarity_dict = {'0-99': [], '100-299': [], '300-999': []}
pred_ans_rarity_dict = {'0-99': [], '100-299': [], '300-999': []}
data_size = dataset.data_size
net.eval()
if self.__C.N_GPU > 1:
net = nn.DataParallel(net, device_ids=self.__C.DEVICES)
dataloader = Data.DataLoader(
dataset,
batch_size=self.__C.EVAL_BATCH_SIZE,
shuffle=False,
num_workers=self.__C.NUM_WORKERS,
pin_memory=True
)
for step, (
ques_ix_iter,
frames_feat_iter,
clips_feat_iter,
_,
ans_iter,
q_type,
qlen_bin,
ans_rarity
) in enumerate(dataloader):
print("\rEvaluation: [step %4d/%4d]" % (
step,
int(data_size / self.__C.EVAL_BATCH_SIZE),
), end=' ')
ques_ix_iter = ques_ix_iter.cuda()
frames_feat_iter = frames_feat_iter.cuda()
clips_feat_iter = clips_feat_iter.cuda()
with torch.no_grad():
pred = net(
frames_feat_iter,
clips_feat_iter,
ques_ix_iter
)
pred_np = pred.cpu().data.numpy()
pred_argmax = np.argmax(pred_np, axis=1)
pred_list.extend(pred_argmax)
ans_ix_list.extend(ans_iter.tolist())
q_type_list.extend(q_type.tolist())
q_bin_list.extend(qlen_bin.tolist())
ans_rarity_list.extend(ans_rarity.tolist())
print('')
assert len(pred_list) == len(ans_ix_list) == len(q_type_list) == len(q_bin_list) == len(ans_rarity_list)
pred_list = [dataset.ix_to_ans[pred] for pred in pred_list]
ans_ix_list = [dataset.ix_to_ans[ans] for ans in ans_ix_list]
# Run validation script
scores_per_qtype = {
'what': {},
'who': {},
'how': {},
'when': {},
'where': {},
}
scores_per_qlen_bin = {
'1-3': {},
'4-8': {},
'9-15': {},
}
scores_ans_rarity_dict = {
'0-99': {},
'100-299': {},
'300-999': {}
}
if valid:
# group predictions and ground truths by question type
for pred, ans, q_type in zip(pred_list, ans_ix_list, q_type_list):
pred_qtype_dict[dataset.idx_to_qtypes[q_type]].append(pred)
ans_qtype_dict[dataset.idx_to_qtypes[q_type]].append(ans)
print('----------------- Computing scores -----------------')
acc = get_acc(ans_ix_list, pred_list)
print('----------------- Overall -----------------')
print('acc: {}'.format(acc))
writer.add_scalar('acc/overall', acc, global_step=epoch)
for q_type in scores_per_qtype:
print('----------------- Computing "{}" q-type scores -----------------'.format(q_type))
# acc, wups_0, wups_1 = get_scores(
# ans_ix_dict[q_type], pred_ix_dict[q_type])
acc = get_acc(ans_qtype_dict[q_type], pred_qtype_dict[q_type])
print('acc: {}'.format(acc))
writer.add_scalar(
'acc/{}'.format(q_type), acc, global_step=epoch)
else:
for pred, ans, q_type, qlen_bin, a_rarity in zip(
pred_list, ans_ix_list, q_type_list, q_bin_list, ans_rarity_list):
pred_qtype_dict[dataset.idx_to_qtypes[q_type]].append(pred)
ans_qtype_dict[dataset.idx_to_qtypes[q_type]].append(ans)
pred_qlen_bin_dict[dataset.idx_to_qlen_bins[qlen_bin]].append(pred)
ans_qlen_bin_dict[dataset.idx_to_qlen_bins[qlen_bin]].append(ans)
pred_ans_rarity_dict[dataset.idx_to_ans_rare[a_rarity]].append(pred)
ans_ans_rarity_dict[dataset.idx_to_ans_rare[a_rarity]].append(ans)
print('----------------- Computing overall scores -----------------')
acc = get_acc(ans_ix_list, pred_list)
print('----------------- Overall -----------------')
print('acc:{}'.format(acc))
print('----------------- Computing q-type scores -----------------')
for q_type in scores_per_qtype:
acc = get_acc(ans_qtype_dict[q_type], pred_qtype_dict[q_type])
print(' {} '.format(q_type))
print('acc:{}'.format(acc))
print('----------------- Computing qlen-bins scores -----------------')
for qlen_bin in scores_per_qlen_bin:
acc = get_acc(ans_qlen_bin_dict[qlen_bin], pred_qlen_bin_dict[qlen_bin])
print(' {} '.format(qlen_bin))
print('acc:{}'.format(acc))
print('----------------- Computing ans-rarity scores -----------------')
for a_rarity in scores_ans_rarity_dict:
acc = get_acc(ans_ans_rarity_dict[a_rarity], pred_ans_rarity_dict[a_rarity])
print(' {} '.format(a_rarity))
print('acc:{}'.format(acc))
net.train()
def construct_net(self, model_type):
if model_type == 1:
net = Net1(
self.__C,
self.dataset.pretrained_emb,
self.dataset.token_size,
self.dataset.ans_size
)
elif model_type == 2:
net = Net2(
self.__C,
self.dataset.pretrained_emb,
self.dataset.token_size,
self.dataset.ans_size
)
elif model_type == 3:
net = Net3(
self.__C,
self.dataset.pretrained_emb,
self.dataset.token_size,
self.dataset.ans_size
)
elif model_type == 4:
net = Net4(
self.__C,
self.dataset.pretrained_emb,
self.dataset.token_size,
self.dataset.ans_size
)
else:
raise ValueError('Net{} is not supported'.format(model_type))
return net
def run(self, run_mode, epoch=None):
self.set_seed(self.__C.SEED)
if run_mode == 'train':
self.empty_log(self.__C.VERSION)
self.train(self.dataset, self.dataset_eval)
elif run_mode == 'val':
net = self.construct_net(self.__C.MODEL_TYPE)
assert epoch is not None
path = self.__C.CKPTS_PATH + \
'ckpt_' + self.__C.VERSION + \
'/epoch' + str(epoch) + '.pkl'
print('Loading ckpt {}'.format(path))
net.load_state_dict(torch.load(path)['state_dict'])
net.cuda()
self.eval(net, self.dataset_eval, self.writer, 0, valid=True)
elif run_mode == 'test':
net = self.construct_net(self.__C.MODEL_TYPE)
assert epoch is not None
path = self.__C.CKPTS_PATH + \
'ckpt_' + self.__C.VERSION + \
'/epoch' + str(epoch) + '.pkl'
print('Loading ckpt {}'.format(path))
state_dict = torch.load(path)['state_dict']
net.load_state_dict(state_dict)
net.cuda()
self.eval(net, self.dataset_test, self.writer, 0)
else:
exit(-1)
def set_seed(self, seed):
"""Sets the seed for reproducibility.
Args:
seed (int): The seed used
"""
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
print('\nSeed set to {}...\n'.format(seed))
def empty_log(self, version):
print('Initializing log file ........')
if (os.path.exists(self.__C.LOG_PATH + 'log_run_' + version + '.txt')):
os.remove(self.__C.LOG_PATH + 'log_run_' + version + '.txt')
print('Finished!')
print('')
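The heart of train() above is the sub-batch slicing used for gradient accumulation: one optimizer step per full batch, with the batch split into GRAD_ACCU_STEPS slices so large batches fit in memory. A self-contained sketch of that pattern (the uppercase names mirror the config fields train() reads; the values are illustrative):

import torch
import torch.nn as nn

BATCH_SIZE, GRAD_ACCU_STEPS = 8, 2
SUB_BATCH_SIZE = BATCH_SIZE // GRAD_ACCU_STEPS

net = nn.Linear(16, 4)
loss_fn = nn.BCEWithLogitsLoss(reduction='sum')
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

x = torch.randn(BATCH_SIZE, 16)
y = torch.randint(0, 2, (BATCH_SIZE, 4)).float()

optim.zero_grad()
for accu_step in range(GRAD_ACCU_STEPS):
    s = slice(accu_step * SUB_BATCH_SIZE, (accu_step + 1) * SUB_BATCH_SIZE)
    loss = loss_fn(net(x[s]), y[s])
    loss.backward()  # gradients accumulate across sub-batches
optim.step()         # a single update for the whole batch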

211
code/core/metrics.py Normal file

@@ -0,0 +1,211 @@
"""
Author: Mateusz Malinowski
Email: mmalinow@mpi-inf.mpg.de
The script assumes there are two files
- first file with ground truth answers
- second file with predicted answers
both answers are line-aligned
The script also assumes that answer items are comma separated.
For instance, chair,table,window
It is also a set measure, so not exactly the same as accuracy
even if dirac measure is used since {book,book}=={book}, also {book,chair}={chair,book}
Logs:
05.09.2015 - white spaces surrounding words are stripped away so that {book, chair}={book,chair}
"""
import sys
#import enchant
from numpy import prod
from nltk.corpus import wordnet as wn
from tqdm import tqdm
def file2list(filepath):
with open(filepath,'r') as f:
lines =[k for k in
[k.strip() for k in f.readlines()]
if len(k) > 0]
return lines
def list2file(filepath,mylist):
mylist='\n'.join(mylist)
with open(filepath,'w') as f:
f.writelines(mylist)
def items2list(x):
"""
x - string of comma-separated answer items
"""
return [l.strip() for l in x.split(',')]
def fuzzy_set_membership_measure(x,A,m):
"""
Set membership measure.
x: element
A: set of elements
m: point-wise element-to-element measure m(a,b) ~ similarity(a,b)
This function implements a fuzzy set membership measure:
m(x \in A) = max_{a \in A} m(x,a)
"""
return 0 if A==[] else max(map(lambda a: m(x,a), A))
def score_it(A,T,m):
"""
A: list of A items
T: list of T items
m: set membership measure
m(a \in A) gives a membership quality of a into A
This function implements a fuzzy accuracy score:
score(A,T) = min{prod_{a \in A} m(a \in T), prod_{t \in T} m(t \in A)}
where A and T are set representations of the answers
and m is a measure
"""
if A==[] and T==[]:
return 1
# print A,T
score_left=0 if A==[] else prod(list(map(lambda a: m(a,T), A)))
score_right=0 if T==[] else prod(list(map(lambda t: m(t,A),T)))
return min(score_left,score_right)
# implementations of different measure functions
def dirac_measure(a,b):
"""
Returns 1 iff a=b and 0 otherwise.
"""
if a==[] or b==[]:
return 0.0
return float(a==b)
def wup_measure(a,b,similarity_threshold=0.925):
"""
Returns Wu-Palmer similarity score.
More specifically, it computes:
max_{x \in interp(a)} max_{y \in interp(b)} wup(x,y)
where interp is a 'interpretation field'
"""
def get_semantic_field(a):
weight = 1.0
semantic_field = wn.synsets(a,pos=wn.NOUN)
return (semantic_field,weight)
def get_stem_word(a):
"""
Sometimes an answer has the form word\d+:wordid.
If so, we would return the word and downweight it;
this stub returns the answer unchanged with weight 1.0.
"""
weight = 1.0
return (a,weight)
global_weight=1.0
(a,global_weight_a)=get_stem_word(a)
(b,global_weight_b)=get_stem_word(b)
global_weight = min(global_weight_a,global_weight_b)
if a==b:
# they are the same
return 1.0*global_weight
if a==[] or b==[]:
return 0
interp_a,weight_a = get_semantic_field(a)
interp_b,weight_b = get_semantic_field(b)
if interp_a == [] or interp_b == []:
return 0
# we take the most optimistic interpretation
global_max=0.0
for x in interp_a:
for y in interp_b:
local_score=x.wup_similarity(y)
if local_score > global_max:
global_max=local_score
# we need to use the semantic fields and therefore we downweight
# unless the score is high which indicates both are synonyms
if global_max < similarity_threshold:
interp_weight = 0.1
else:
interp_weight = 1.0
final_score=global_max*weight_a*weight_b*interp_weight*global_weight
return final_score
###
def get_scores(input_gt, input_pred, threshold_0=0.0, threshold_1=0.9):
element_membership_acc=dirac_measure
element_membership_wups_0=lambda x,y: wup_measure(x,y,threshold_0)
element_membership_wups_1=lambda x,y: wup_measure(x,y,threshold_1)
set_membership_acc=\
lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_acc)
set_membership_wups_0=\
lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_wups_0)
set_membership_wups_1=\
lambda x,A: fuzzy_set_membership_measure(x,A,element_membership_wups_1)
score_list_acc = []
score_list_wups_0 = []
score_list_wups_1 = []
pbar = tqdm(zip(input_gt,input_pred))
pbar.set_description('Computing Acc')
for (ta,pa) in pbar:
score_list_acc.append(score_it(items2list(ta),items2list(pa),set_membership_acc))
#final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list))
final_score_acc=float(sum(score_list_acc))/float(len(score_list_acc))
final_score_acc *= 100.0
pbar = tqdm(zip(input_gt,input_pred))
pbar.set_description('Computing Wups_0.0')
for (ta,pa) in pbar:
score_list_wups_0.append(score_it(items2list(ta),items2list(pa),set_membership_wups_0))
#final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list))
final_score_wups_0=float(sum(score_list_wups_0))/float(len(score_list_wups_0))
final_score_wups_0 *= 100.0
pbar = tqdm(zip(input_gt,input_pred))
pbar.set_description('Computing Wups_0.9')
for (ta,pa) in pbar:
score_list_wups_1.append(score_it(items2list(ta),items2list(pa),set_membership_wups_1))
#final_score=sum(map(lambda x:float(x)/float(len(score_list)),score_list))
final_score_wups_1=float(sum(score_list_wups_1))/float(len(score_list_wups_1))
final_score_wups_1 *= 100.0
# filtering to obtain the results
#print 'full score:', score_list
# print('accuracy = {0:.2f} | WUPS@{1} = {2:.2f} | WUPS@{3} = {4:.2f}'.format(
# final_score_acc, threshold_0, final_score_wups_0, threshold_1, final_score_wups_1))
return final_score_acc, final_score_wups_0, final_score_wups_1
def get_acc(gts, preds):
sum_correct = 0
assert len(gts) == len(preds)
for gt, pred in zip(gts, preds):
if gt == pred:
sum_correct += 1
acc = 100.0 * sum_correct / len(gts)
return acc
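A toy run of the scoring helpers above: get_acc is exact string matching, while score_it with the dirac measure reproduces the set semantics described in the module docstring ({book,chair} == {chair, book}):

from core.metrics import (get_acc, score_it, items2list,
                          fuzzy_set_membership_measure, dirac_measure)

print(get_acc(['chair', 'table'], ['chair', 'sofa']))  # 50.0

m = lambda x, A: fuzzy_set_membership_measure(x, A, dirac_measure)
print(score_it(items2list('book,chair'), items2list('chair, book'), m))  # 1.0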

0
code/core/model/.gitkeep Normal file

80
code/core/model/C3D.py Normal file

@@ -0,0 +1,80 @@
"""
from https://github.com/DavideA/c3d-pytorch/blob/master/C3D_model.py
"""
import torch.nn as nn
class C3D(nn.Module):
"""
The C3D network as described in [1].
"""
def __init__(self):
super(C3D, self).__init__()
self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))
self.fc6 = nn.Linear(8192, 4096)
self.fc7 = nn.Linear(4096, 4096)
self.fc8 = nn.Linear(4096, 487)
self.dropout = nn.Dropout(p=0.5)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)  # unused below: the fc8/softmax head is commented out in forward()
def forward(self, x):
h = self.relu(self.conv1(x))
h = self.pool1(h)
h = self.relu(self.conv2(h))
h = self.pool2(h)
h = self.relu(self.conv3a(h))
h = self.relu(self.conv3b(h))
h = self.pool3(h)
h = self.relu(self.conv4a(h))
h = self.relu(self.conv4b(h))
h = self.pool4(h)
h = self.relu(self.conv5a(h))
h = self.relu(self.conv5b(h))
h = self.pool5(h)
h = h.view(-1, 8192)
h = self.relu(self.fc6(h))
h = self.dropout(h)
h = self.relu(self.fc7(h))
# h = self.dropout(h)
# logits = self.fc8(h)
# probs = self.softmax(logits)
return h
"""
References
----------
[1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks."
Proceedings of the IEEE international conference on computer vision. 2015.
"""

323
code/core/model/dnc.py Normal file

@@ -0,0 +1,323 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""
# -*- coding: utf-8 -*-
import torch.nn as nn
import torch as T
from torch.autograd import Variable as var
import numpy as np
from torch.nn.utils.rnn import pad_packed_sequence as pad
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import PackedSequence
from .util import *
from .memory import *
from torch.nn.init import orthogonal_, xavier_uniform_
class DNC(nn.Module):
def __init__(
self,
input_size,
hidden_size,
rnn_type='lstm',
num_layers=1,
num_hidden_layers=2,
bias=True,
batch_first=True,
dropout=0,
bidirectional=False,
nr_cells=5,
read_heads=2,
cell_size=10,
nonlinearity='tanh',
gpu_id=-1,
independent_linears=False,
share_memory=True,
debug=False,
clip=20
):
super(DNC, self).__init__()
# todo: separate weights and RNNs for the interface and output vectors
self.input_size = input_size
self.hidden_size = hidden_size
self.rnn_type = rnn_type
self.num_layers = num_layers
self.num_hidden_layers = num_hidden_layers
self.bias = bias
self.batch_first = batch_first
self.dropout = dropout
self.bidirectional = bidirectional
self.nr_cells = nr_cells
self.read_heads = read_heads
self.cell_size = cell_size
self.nonlinearity = nonlinearity
self.gpu_id = gpu_id
self.independent_linears = independent_linears
self.share_memory = share_memory
self.debug = debug
self.clip = clip
self.w = self.cell_size
self.r = self.read_heads
self.read_vectors_size = self.r * self.w
self.output_size = self.hidden_size
self.nn_input_size = self.input_size + self.read_vectors_size
self.nn_output_size = self.output_size + self.read_vectors_size
self.rnns = []
self.memories = []
for layer in range(self.num_layers):
if self.rnn_type.lower() == 'rnn':
self.rnns.append(nn.RNN((self.nn_input_size if layer == 0 else self.nn_output_size), self.output_size,
bias=self.bias, nonlinearity=self.nonlinearity, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
elif self.rnn_type.lower() == 'gru':
self.rnns.append(nn.GRU((self.nn_input_size if layer == 0 else self.nn_output_size),
self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
if self.rnn_type.lower() == 'lstm':
self.rnns.append(nn.LSTM((self.nn_input_size if layer == 0 else self.nn_output_size),
self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
setattr(self, self.rnn_type.lower() + '_layer_' + str(layer), self.rnns[layer])
# memories for each layer
if not self.share_memory:
self.memories.append(
Memory(
input_size=self.output_size,
mem_size=self.nr_cells,
cell_size=self.w,
read_heads=self.r,
gpu_id=self.gpu_id,
independent_linears=self.independent_linears
)
)
setattr(self, 'rnn_layer_memory_' + str(layer), self.memories[layer])
# only one memory shared by all layers
if self.share_memory:
self.memories.append(
Memory(
input_size=self.output_size,
mem_size=self.nr_cells,
cell_size=self.w,
read_heads=self.r,
gpu_id=self.gpu_id,
independent_linears=self.independent_linears
)
)
setattr(self, 'rnn_layer_memory_shared', self.memories[0])
# final output layer
self.output = nn.Linear(self.nn_output_size, self.output_size)
orthogonal_(self.output.weight)
if self.gpu_id != -1:
[x.cuda(self.gpu_id) for x in self.rnns]
[x.cuda(self.gpu_id) for x in self.memories]
self.output.cuda()
def _init_hidden(self, hx, batch_size, reset_experience):
# create empty hidden states if not provided
if hx is None:
hx = (None, None, None)
(chx, mhx, last_read) = hx
# initialize hidden state of the controller RNN
if chx is None:
h = cuda(T.zeros(self.num_hidden_layers, batch_size, self.output_size), gpu_id=self.gpu_id)
xavier_uniform_(h)
chx = [ (h, h) if self.rnn_type.lower() == 'lstm' else h for x in range(self.num_layers)]
# Last read vectors
if last_read is None:
last_read = cuda(T.zeros(batch_size, self.w * self.r), gpu_id=self.gpu_id)
# memory states
if mhx is None:
if self.share_memory:
mhx = self.memories[0].reset(batch_size, erase=reset_experience)
else:
mhx = [m.reset(batch_size, erase=reset_experience) for m in self.memories]
else:
if self.share_memory:
mhx = self.memories[0].reset(batch_size, mhx, erase=reset_experience)
else:
mhx = [m.reset(batch_size, h, erase=reset_experience) for m, h in zip(self.memories, mhx)]
return chx, mhx, last_read
def _debug(self, mhx, debug_obj):
if not debug_obj:
debug_obj = {
'memory': [],
'link_matrix': [],
'precedence': [],
'read_weights': [],
'write_weights': [],
'usage_vector': [],
}
debug_obj['memory'].append(mhx['memory'][0].data.cpu().numpy())
debug_obj['link_matrix'].append(mhx['link_matrix'][0][0].data.cpu().numpy())
debug_obj['precedence'].append(mhx['precedence'][0].data.cpu().numpy())
debug_obj['read_weights'].append(mhx['read_weights'][0].data.cpu().numpy())
debug_obj['write_weights'].append(mhx['write_weights'][0].data.cpu().numpy())
debug_obj['usage_vector'].append(mhx['usage_vector'][0].unsqueeze(0).data.cpu().numpy())
return debug_obj
def _layer_forward(self, input, layer, hx=(None, None), pass_through_memory=True):
(chx, mhx) = hx
# pass through the controller layer
input, chx = self.rnns[layer](input.unsqueeze(1), chx)
input = input.squeeze(1)
# clip the controller output
if self.clip != 0:
output = T.clamp(input, -self.clip, self.clip)
else:
output = input
# the interface vector
ξ = output
# pass through memory
if pass_through_memory:
if self.share_memory:
read_vecs, mhx = self.memories[0](ξ, mhx)
else:
read_vecs, mhx = self.memories[layer](ξ, mhx)
# the read vectors
read_vectors = read_vecs.view(-1, self.w * self.r)
else:
read_vectors = None
return output, (chx, mhx, read_vectors)
def forward(self, input, hx=(None, None, None), reset_experience=False, pass_through_memory=True):
# handle packed data
is_packed = type(input) is PackedSequence
if is_packed:
input, lengths = pad(input)
max_length = lengths[0]
else:
max_length = input.size(1) if self.batch_first else input.size(0)
lengths = [input.size(1)] * max_length if self.batch_first else [input.size(0)] * max_length
batch_size = input.size(0) if self.batch_first else input.size(1)
if not self.batch_first:
input = input.transpose(0, 1)
# make the data time-first
controller_hidden, mem_hidden, last_read = self._init_hidden(hx, batch_size, reset_experience)
# concat input with last read (or padding) vectors
inputs = [T.cat([input[:, x, :], last_read], 1) for x in range(max_length)]
# batched forward pass per element / word / etc
if self.debug:
viz = None
outs = [None] * max_length
read_vectors = None
rv = [None] * max_length
# pass through time
for time in range(max_length):
# pass through layers
for layer in range(self.num_layers):
# this layer's hidden states
chx = controller_hidden[layer]
m = mem_hidden if self.share_memory else mem_hidden[layer]
# pass through controller
outs[time], (chx, m, read_vectors) = \
self._layer_forward(inputs[time], layer, (chx, m), pass_through_memory)
# debug memory
if self.debug:
viz = self._debug(m, viz)
# store the memory back (per layer or shared)
if self.share_memory:
mem_hidden = m
else:
mem_hidden[layer] = m
controller_hidden[layer] = chx
if read_vectors is not None:
# the controller output + read vectors go into next layer
outs[time] = T.cat([outs[time], read_vectors], 1)
if layer == self.num_layers - 1:
rv[time] = read_vectors.reshape(batch_size, self.r, self.w)
else:
outs[time] = T.cat([outs[time], last_read], 1)
inputs[time] = outs[time]
if self.debug:
viz = {k: np.array(v) for k, v in viz.items()}
viz = {k: v.reshape(v.shape[0], v.shape[1] * v.shape[2]) for k, v in viz.items()}
# pass through final output layer
inputs = [self.output(i) for i in inputs]
outputs = T.stack(inputs, 1 if self.batch_first else 0)
if is_packed:
outputs = pack(outputs, lengths)
if self.debug:
return outputs, (controller_hidden, mem_hidden, read_vectors), rv, viz
else:
return outputs, (controller_hidden, mem_hidden, read_vectors), rv
def __repr__(self):
s = "\n----------------------------------------\n"
s += '{name}({input_size}, {hidden_size}'
if self.rnn_type != 'lstm':
s += ', rnn_type={rnn_type}'
if self.num_layers != 1:
s += ', num_layers={num_layers}'
if self.num_hidden_layers != 2:
s += ', num_hidden_layers={num_hidden_layers}'
if self.bias != True:
s += ', bias={bias}'
if self.batch_first != True:
s += ', batch_first={batch_first}'
if self.dropout != 0:
s += ', dropout={dropout}'
if self.bidirectional != False:
s += ', bidirectional={bidirectional}'
if self.nr_cells != 5:
s += ', nr_cells={nr_cells}'
if self.read_heads != 2:
s += ', read_heads={read_heads}'
if self.cell_size != 10:
s += ', cell_size={cell_size}'
if self.nonlinearity != 'tanh':
s += ', nonlinearity={nonlinearity}'
if self.gpu_id != -1:
s += ', gpu_id={gpu_id}'
if self.independent_linears != False:
s += ', independent_linears={independent_linears}'
if self.share_memory != True:
s += ', share_memory={share_memory}'
if self.debug != False:
s += ', debug={debug}'
if self.clip != 20:
s += ', clip={clip}'
s += ")\n" + super(DNC, self).__repr__() + \
"\n----------------------------------------\n"
return s.format(name=self.__class__.__name__, **self.__dict__)
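A CPU smoke test of the controller/memory loop, assuming core/model/util.py from the same repo provides the cuda, σ and θ helpers imported above:

import torch
from core.model.dnc import DNC

rnn = DNC(input_size=32, hidden_size=64, nr_cells=5, read_heads=2,
          cell_size=10, gpu_id=-1)  # gpu_id=-1 keeps everything on the CPU
x = torch.randn(4, 7, 32)           # (batch, time, input_size)
out, (chx, mhx, read_vecs), rv = rnn(x, reset_experience=True)
print(out.shape)                    # torch.Size([4, 7, 64])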

208
code/core/model/mca.py Normal file

@@ -0,0 +1,208 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from core.model.net_utils import FC, MLP, LayerNorm
from core.model.dnc_improved import DNC, SharedMemDNC
from core.model.dnc_improved import FeedforwardController
import torch.nn as nn
import torch.nn.functional as F
import torch, math
import time
# ------------------------------
# ---- Multi-Head Attention ----
# ------------------------------
class MHAtt(nn.Module):
def __init__(self, __C):
super(MHAtt, self).__init__()
self.__C = __C
self.linear_v = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.linear_k = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.linear_q = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.linear_merge = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.dropout = nn.Dropout(__C.DROPOUT_R)
def forward(self, v, k, q, mask):
n_batches = q.size(0)
v = self.linear_v(v).view(
n_batches,
-1,
self.__C.MULTI_HEAD,
self.__C.HIDDEN_SIZE_HEAD
).transpose(1, 2)
k = self.linear_k(k).view(
n_batches,
-1,
self.__C.MULTI_HEAD,
self.__C.HIDDEN_SIZE_HEAD
).transpose(1, 2)
q = self.linear_q(q).view(
n_batches,
-1,
self.__C.MULTI_HEAD,
self.__C.HIDDEN_SIZE_HEAD
).transpose(1, 2)
atted = self.att(v, k, q, mask)
atted = atted.transpose(1, 2).contiguous().view(
n_batches,
-1,
self.__C.HIDDEN_SIZE
)
atted = self.linear_merge(atted)
return atted
def att(self, value, key, query, mask):
d_k = query.size(-1)
scores = torch.matmul(
query, key.transpose(-2, -1)
) / math.sqrt(d_k)
if mask is not None:
scores = scores.masked_fill(mask, -1e9)
att_map = F.softmax(scores, dim=-1)
att_map = self.dropout(att_map)
return torch.matmul(att_map, value)
# ---------------------------
# ---- Feed Forward Nets ----
# ---------------------------
class FFN(nn.Module):
def __init__(self, __C):
super(FFN, self).__init__()
self.mlp = MLP(
in_size=__C.HIDDEN_SIZE,
mid_size=__C.FF_SIZE,
out_size=__C.HIDDEN_SIZE,
dropout_r=__C.DROPOUT_R,
use_relu=True
)
def forward(self, x):
return self.mlp(x)
# ------------------------
# ---- Self Attention ----
# ------------------------
class SA(nn.Module):
def __init__(self, __C):
super(SA, self).__init__()
self.mhatt = MHAtt(__C)
self.ffn = FFN(__C)
self.dropout1 = nn.Dropout(__C.DROPOUT_R)
self.norm1 = LayerNorm(__C.HIDDEN_SIZE)
self.dropout2 = nn.Dropout(__C.DROPOUT_R)
self.norm2 = LayerNorm(__C.HIDDEN_SIZE)
def forward(self, x, x_mask):
x = self.norm1(x + self.dropout1(
self.mhatt(x, x, x, x_mask)
))
x = self.norm2(x + self.dropout2(
self.ffn(x)
))
return x
# -------------------------------
# ---- Self Guided Attention ----
# -------------------------------
class SGA(nn.Module):
def __init__(self, __C):
super(SGA, self).__init__()
self.mhatt1 = MHAtt(__C)
self.mhatt2 = MHAtt(__C)
self.ffn = FFN(__C)
self.dropout1 = nn.Dropout(__C.DROPOUT_R)
self.norm1 = LayerNorm(__C.HIDDEN_SIZE)
self.dropout2 = nn.Dropout(__C.DROPOUT_R)
self.norm2 = LayerNorm(__C.HIDDEN_SIZE)
self.dropout3 = nn.Dropout(__C.DROPOUT_R)
self.norm3 = LayerNorm(__C.HIDDEN_SIZE)
def forward(self, x, y, x_mask, y_mask):
x = self.norm1(x + self.dropout1(
self.mhatt1(x, x, x, x_mask)
))
x = self.norm2(x + self.dropout2(
self.mhatt2(y, y, x, y_mask)
))
x = self.norm3(x + self.dropout3(
self.ffn(x)
))
return x
# ------------------------------------------------
# ---- MCA Layers Cascaded by Encoder-Decoder ----
# ------------------------------------------------
class MCA_ED(nn.Module):
def __init__(self, __C):
super(MCA_ED, self).__init__()
self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)])
self.dec_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
def forward(self, x, y, x_mask, y_mask):
# Get hidden vector
for enc in self.enc_list:
x = enc(x, x_mask)
for dec in self.dec_list:
y = dec(y, x, y_mask, x_mask)
return x, y
class VLC(nn.Module):
def __init__(self, __C):
super(VLC, self).__init__()
self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)])
self.dec_lang_frames_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
self.dec_lang_clips_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
def forward(self, x, y, z, x_mask, y_mask, z_mask):
# Get hidden vector
for enc in self.enc_list:
x = enc(x, x_mask)
for dec in self.dec_lang_frames_list:
y = dec(y, x, y_mask, x_mask)
for dec in self.dec_lang_clips_list:
z = dec(z, x, z_mask, x_mask)
return x, y, z
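A toy forward pass through a single SA block. The SimpleNamespace below is a hypothetical stand-in that only supplies the config fields MHAtt and FFN read (importing mca assumes the rest of the repo, e.g. core.model.dnc_improved, is on the path); note that HIDDEN_SIZE must equal MULTI_HEAD * HIDDEN_SIZE_HEAD:

import torch
from types import SimpleNamespace
from core.model.mca import SA

cfg = SimpleNamespace(HIDDEN_SIZE=512, MULTI_HEAD=8, HIDDEN_SIZE_HEAD=64,
                      DROPOUT_R=0.1, FF_SIZE=2048)
sa = SA(cfg)
x = torch.randn(2, 10, 512)  # (batch, seq_len, hidden)
out = sa(x, None)            # no padding mask in this toy run
print(out.shape)             # torch.Size([2, 10, 512])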

314
code/core/model/memory.py Normal file

@@ -0,0 +1,314 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""
# -*- coding: utf-8 -*-
import torch.nn as nn
import torch as T
from torch.autograd import Variable as var
import torch.nn.functional as F
import numpy as np
from core.model.util import *
class Memory(nn.Module):
def __init__(self, input_size, mem_size=512, cell_size=32, read_heads=4, gpu_id=-1, independent_linears=True):
super(Memory, self).__init__()
self.input_size = input_size
self.mem_size = mem_size
self.cell_size = cell_size
self.read_heads = read_heads
self.gpu_id = gpu_id
self.independent_linears = independent_linears
m = self.mem_size
w = self.cell_size
r = self.read_heads
if self.independent_linears:
self.read_keys_transform = nn.Linear(self.input_size, w * r)
self.read_strengths_transform = nn.Linear(self.input_size, r)
self.write_key_transform = nn.Linear(self.input_size, w)
self.write_strength_transform = nn.Linear(self.input_size, 1)
self.erase_vector_transform = nn.Linear(self.input_size, w)
self.write_vector_transform = nn.Linear(self.input_size, w)
self.free_gates_transform = nn.Linear(self.input_size, r)
self.allocation_gate_transform = nn.Linear(self.input_size, 1)
self.write_gate_transform = nn.Linear(self.input_size, 1)
self.read_modes_transform = nn.Linear(self.input_size, 3 * r)
else:
self.interface_size = (w * r) + (3 * w) + (5 * r) + 3
self.interface_weights = nn.Linear(
self.input_size, self.interface_size)
self.I = cuda(1 - T.eye(m).unsqueeze(0),
gpu_id=self.gpu_id) # (1 * n * n)
def reset(self, batch_size=1, hidden=None, erase=True):
m = self.mem_size
w = self.cell_size
r = self.read_heads
b = batch_size
if hidden is None:
return {
'memory': cuda(T.zeros(b, m, w).fill_(0), gpu_id=self.gpu_id),
'link_matrix': cuda(T.zeros(b, 1, m, m), gpu_id=self.gpu_id),
'precedence': cuda(T.zeros(b, 1, m), gpu_id=self.gpu_id),
'read_weights': cuda(T.zeros(b, r, m).fill_(0), gpu_id=self.gpu_id),
'write_weights': cuda(T.zeros(b, 1, m).fill_(0), gpu_id=self.gpu_id),
'usage_vector': cuda(T.zeros(b, m), gpu_id=self.gpu_id),
# 'free_gates': cuda(T.zeros(b, r), gpu_id=self.gpu_id),
# 'alloc_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id),
# 'write_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id),
# 'read_modes': cuda(T.zeros(b, r, 3), gpu_id=self.gpu_id)
}
else:
hidden['memory'] = hidden['memory'].clone()
hidden['link_matrix'] = hidden['link_matrix'].clone()
hidden['precedence'] = hidden['precedence'].clone()
hidden['read_weights'] = hidden['read_weights'].clone()
hidden['write_weights'] = hidden['write_weights'].clone()
hidden['usage_vector'] = hidden['usage_vector'].clone()
# hidden['free_gates'] = hidden['free_gates'].clone()
# hidden['alloc_gates'] = hidden['alloc_gates'].clone()
# hidden['write_gates'] = hidden['write_gates'].clone()
# hidden['read_modes'] = hidden['read_modes'].clone()
if erase:
hidden['memory'].data.fill_(0)
hidden['link_matrix'].data.zero_()
hidden['precedence'].data.zero_()
hidden['read_weights'].data.fill_(0)
hidden['write_weights'].data.fill_(0)
hidden['usage_vector'].data.zero_()
# hidden['free_gates'].data.fill_()
# hidden['alloc_gates'].data.fill_()
# hidden['write_gates'].data.fill_()
# hidden['read_modes'].data.fill_()
return hidden
def get_usage_vector(self, usage, free_gates, read_weights, write_weights):
# write_weights = write_weights.detach() # detach from the computation graph
# if read_weights.size(0) > free_gates.size(0):
# read_weights = read_weights[:free_gates.size(0), :, :]
# if usage.size(0) > free_gates.size(0):
# usage = usage[:free_gates.size(0), :]
# if write_weights.size(0) > free_gates.size(0):
# write_weights = write_weights[:free_gates.size(0), :, :]
usage = usage + (1 - usage) * (1 - T.prod(1 - write_weights, 1))
ψ = T.prod(1 - free_gates.unsqueeze(2) * read_weights, 1)
return usage * ψ
def allocate(self, usage, write_gate):
# ensure values are not too small prior to cumprod.
usage = δ + (1 - δ) * usage
batch_size = usage.size(0)
# free list
sorted_usage, φ = T.topk(usage, self.mem_size, dim=1, largest=False)
# cumprod with exclusive=True
# https://discuss.pytorch.org/t/cumprod-exclusive-true-equivalences/2614/8
v = var(sorted_usage.data.new(batch_size, 1).fill_(1))
cat_sorted_usage = T.cat((v, sorted_usage), 1)
prod_sorted_usage = T.cumprod(cat_sorted_usage, 1)[:, :-1]
sorted_allocation_weights = (1 - sorted_usage) * prod_sorted_usage.squeeze()
# construct the reverse sorting index https://stackoverflow.com/questions/2483696/undo-or-reverse-argsort-python
_, φ_rev = T.topk(φ, k=self.mem_size, dim=1, largest=False)
allocation_weights = sorted_allocation_weights.gather(1, φ_rev.long())
return allocation_weights.unsqueeze(1), usage
def write_weighting(self, memory, write_content_weights, allocation_weights, write_gate, allocation_gate):
ag = allocation_gate.unsqueeze(-1)
wg = write_gate.unsqueeze(-1)
return wg * (ag * allocation_weights + (1 - ag) * write_content_weights)
def get_link_matrix(self, link_matrix, write_weights, precedence):
precedence = precedence.unsqueeze(2)
write_weights_i = write_weights.unsqueeze(3)
write_weights_j = write_weights.unsqueeze(2)
prev_scale = 1 - write_weights_i - write_weights_j
new_link_matrix = write_weights_i * precedence
link_matrix = prev_scale * link_matrix + new_link_matrix
# trick to delete diag elems
return self.I.expand_as(link_matrix) * link_matrix
def update_precedence(self, precedence, write_weights):
return (1 - T.sum(write_weights, 2, keepdim=True)) * precedence + write_weights
def write(self, write_key, write_vector, erase_vector, free_gates, read_strengths, write_strength, write_gate, allocation_gate, hidden):
# get current usage
hidden['usage_vector'] = self.get_usage_vector(
hidden['usage_vector'],
free_gates,
hidden['read_weights'],
hidden['write_weights']
)
# lookup memory with write_key and write_strength
write_content_weights = self.content_weightings(
hidden['memory'], write_key, write_strength)
# get memory allocation
alloc, _ = self.allocate(
hidden['usage_vector'],
allocation_gate * write_gate
)
# get write weightings
hidden['write_weights'] = self.write_weighting(
hidden['memory'],
write_content_weights,
alloc,
write_gate,
allocation_gate
)
weighted_resets = hidden['write_weights'].unsqueeze(
3) * erase_vector.unsqueeze(2)
reset_gate = T.prod(1 - weighted_resets, 1)
# Update memory
hidden['memory'] = hidden['memory'] * reset_gate
hidden['memory'] = hidden['memory'] + \
T.bmm(hidden['write_weights'].transpose(1, 2), write_vector)
# update link_matrix
hidden['link_matrix'] = self.get_link_matrix(
hidden['link_matrix'],
hidden['write_weights'],
hidden['precedence']
)
hidden['precedence'] = self.update_precedence(
hidden['precedence'], hidden['write_weights'])
return hidden
def content_weightings(self, memory, keys, strengths):
# if memory.size(0) > keys.size(0):
# memory = memory[:keys.size(0), :, :]
d = θ(memory, keys)
return σ(d * strengths.unsqueeze(2), 2)
def directional_weightings(self, link_matrix, read_weights):
rw = read_weights.unsqueeze(1)
f = T.matmul(link_matrix, rw.transpose(2, 3)).transpose(2, 3)
b = T.matmul(rw, link_matrix)
return f.transpose(1, 2), b.transpose(1, 2)
def read_weightings(self, memory, content_weights, link_matrix, read_modes, read_weights):
forward_weight, backward_weight = self.directional_weightings(
link_matrix, read_weights)
content_mode = read_modes[:, :, 2].contiguous(
).unsqueeze(2) * content_weights
backward_mode = T.sum(
read_modes[:, :, 0:1].contiguous().unsqueeze(3) * backward_weight, 2)
forward_mode = T.sum(
read_modes[:, :, 1:2].contiguous().unsqueeze(3) * forward_weight, 2)
return backward_mode + content_mode + forward_mode
def read_vectors(self, memory, read_weights):
return T.bmm(read_weights, memory)
def read(self, read_keys, read_strengths, read_modes, hidden):
content_weights = self.content_weightings(
hidden['memory'], read_keys, read_strengths)
hidden['read_weights'] = self.read_weightings(
hidden['memory'],
content_weights,
hidden['link_matrix'],
read_modes,
hidden['read_weights']
)
read_vectors = self.read_vectors(
hidden['memory'], hidden['read_weights'])
return read_vectors, hidden
def forward(self, ξ, hidden):
# ξ = ξ.detach()
m = self.mem_size
w = self.cell_size
r = self.read_heads
b = ξ.size()[0]
if self.independent_linears:
# r read keys (b * r * w)
read_keys = self.read_keys_transform(ξ).view(b, r, w)
# r read strengths (b * r)
read_strengths = F.softplus(
self.read_strengths_transform(ξ).view(b, r))
# write key (b * 1 * w)
write_key = self.write_key_transform(ξ).view(b, 1, w)
# write strength (b * 1)
write_strength = F.softplus(
self.write_strength_transform(ξ).view(b, 1))
# erase vector (b * 1 * w)
erase_vector = T.sigmoid(
self.erase_vector_transform(ξ).view(b, 1, w))
# write vector (b * 1 * w)
write_vector = self.write_vector_transform(ξ).view(b, 1, w)
# r free gates (b * r)
free_gates = T.sigmoid(self.free_gates_transform(ξ).view(b, r))
# allocation gate (b * 1)
allocation_gate = T.sigmoid(
self.allocation_gate_transform(ξ).view(b, 1))
# write gate (b * 1)
write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1))
# read modes (b * r * 3)
read_modes = σ(self.read_modes_transform(ξ).view(b, r, 3), -1)
else:
ξ = self.interface_weights(ξ)
# r read keys (b * w * r)
read_keys = ξ[:, :r * w].contiguous().view(b, r, w)
# r read strengths (b * r)
read_strengths = F.softplus(
ξ[:, r * w:r * w + r].contiguous().view(b, r))
# write key (b * w * 1)
write_key = ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w)
# write strength (b * 1)
write_strength = F.softplus(
ξ[:, r * w + r + w].contiguous().view(b, 1))
# erase vector (b * w)
erase_vector = T.sigmoid(
ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w))
# write vector (b * w)
write_vector = ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w)
# r free gates (b * r)
free_gates = T.sigmoid(
ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r))
# allocation gate (b * 1)
allocation_gate = T.sigmoid(
ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1))
# write gate (b * 1)
write_gate = T.sigmoid(
ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1)
# read modes (b * 3*r)
read_modes = σ(ξ[:, r * w + 2 * r + 3 * w + 3: r *
w + 5 * r + 3 * w + 3].contiguous().view(b, r, 3), -1)
hidden = self.write(write_key, write_vector, erase_vector, free_gates,
read_strengths, write_strength, write_gate, allocation_gate, hidden)
hidden["free_gates"] = free_gates.clone().detach()
hidden["allocation_gate"] = allocation_gate.clone().detach()
hidden["write_gate"] = write_gate.clone().detach()
hidden["read_modes"] = read_modes.clone().detach()
return self.read(read_keys, read_strengths, read_modes, hidden)
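The core of content_weightings above is content-based addressing: cosine similarity between each read/write key and every memory slot, sharpened by a per-key strength and normalized with a softmax (θ and σ live in the repo's util.py). A standalone sketch of the same computation:

import torch
import torch.nn.functional as F

b, m, w, r = 2, 5, 10, 2  # batch, memory slots, cell size, read heads
memory = torch.randn(b, m, w)
keys = torch.randn(b, r, w)
strengths = F.softplus(torch.randn(b, r))

# cosine similarity of every key with every slot: (b, r, m)
sim = F.cosine_similarity(memory.unsqueeze(1), keys.unsqueeze(2), dim=-1)
weights = F.softmax(sim * strengths.unsqueeze(2), dim=2)  # address distribution per head
read_vectors = torch.bmm(weights, memory)                 # (b, r, w), one vector per head
print(read_vectors.shape)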

501
code/core/model/net.py Normal file

@@ -0,0 +1,501 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from core.model.net_utils import FC, MLP, LayerNorm
from core.model.mca import SA, MCA_ED, VLC
from core.model.dnc import DNC
import torch.nn as nn
import torch.nn.functional as F
import torch
# ------------------------------
# ---- Flatten the sequence ----
# ------------------------------
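# AttFlat collapses a variable-length sequence (b, seq_len, HIDDEN_SIZE)
# into a fixed-size vector: an MLP scores each position per glimpse,
# padding positions are masked out before the softmax, and the
# glimpse-weighted sums are concatenated and projected to FLAT_OUT_SIZE.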
class AttFlat(nn.Module):
def __init__(self, __C):
super(AttFlat, self).__init__()
self.__C = __C
self.mlp = MLP(
in_size=__C.HIDDEN_SIZE,
mid_size=__C.FLAT_MLP_SIZE,
out_size=__C.FLAT_GLIMPSES,
dropout_r=__C.DROPOUT_R,
use_relu=True
)
self.linear_merge = nn.Linear(
__C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
__C.FLAT_OUT_SIZE
)
def forward(self, x, x_mask):
att = self.mlp(x)
att = att.masked_fill(
x_mask.squeeze(1).squeeze(1).unsqueeze(2),
-1e9
)
att = F.softmax(att, dim=1)
att_list = []
for i in range(self.__C.FLAT_GLIMPSES):
att_list.append(
torch.sum(att[:, :, i: i + 1] * x, dim=1)
)
x_atted = torch.cat(att_list, dim=1)
x_atted = self.linear_merge(x_atted)
return x_atted
class AttFlatMem(AttFlat):
def __init__(self, __C):
super(AttFlatMem, self).__init__(__C)
self.__C = __C
def forward(self, x_mem, x, x_mask):
att = self.mlp(x_mem)
att = att.masked_fill(
x_mask.squeeze(1).squeeze(1).unsqueeze(2),
float('-inf')
)
att = F.softmax(att, dim=1)
att_list = []
for i in range(self.__C.FLAT_GLIMPSES):
att_list.append(
torch.sum(att[:, :, i: i + 1] * x, dim=1)
)
x_atted = torch.cat(att_list, dim=1)
x_atted = self.linear_merge(x_atted)
return x_atted
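# AttFlatMem differs from AttFlat only in that the attention scores are
# computed from the memory-augmented features x_mem, while the weighted
# sum is still taken over x. Note that masking with float('-inf') yields
# NaNs after the softmax for rows that are entirely masked, whereas
# AttFlat's -1e9 degrades gracefully.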
# -------------------------
# ---- Main MCAN Model ----
# -------------------------
class Net1(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net1, self).__init__()
print('Training with Network type 1: VLCN')
self.pretrained_path = __C.PRETRAINED_PATH
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = VLC(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_frame = AttFlat(__C)
self.attflat_clip = AttFlat(__C)
self.dnc = DNC(
__C.FLAT_OUT_SIZE,
__C.FLAT_OUT_SIZE,
rnn_type='lstm',
num_layers=2,
num_hidden_layers=2,
bias=True,
batch_first=True,
dropout=0,
bidirectional=True,
nr_cells=__C.CELL_COUNT_DNC,
read_heads=__C.N_READ_HEADS_DNC,
cell_size=__C.WORD_LENGTH_DNC,
nonlinearity='tanh',
gpu_id=0,
independent_linears=False,
share_memory=False,
debug=False,
clip=20,
)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj_norm_dnc = LayerNorm(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC)
self.linear_dnc = FC(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC, __C.FLAT_OUT_SIZE, dropout_r=0.2)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# Backbone Framework
lang_feat, frame_feat, clip_feat = self.backbone(
lang_feat,
frame_feat,
clip_feat,
lang_feat_mask,
frame_feat_mask,
clip_feat_mask
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
frame_feat = self.attflat_frame(
frame_feat,
frame_feat_mask
)
clip_feat = self.attflat_clip(
clip_feat,
clip_feat_mask
)
proj_feat_0 = lang_feat + frame_feat + clip_feat
proj_feat_0 = self.proj_norm(proj_feat_0)
proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1)
proj_feat_1, (_, _, rv), _ = self.dnc(proj_feat_1, (None, None, None), reset_experience=True, pass_through_memory=True)
proj_feat_1 = proj_feat_1.sum(1)
proj_feat_1 = torch.cat([proj_feat_1, rv], dim=-1)
proj_feat_1 = self.proj_norm_dnc(proj_feat_1)
proj_feat_1 = self.linear_dnc(proj_feat_1)
# proj_feat_1 = self.proj_norm(proj_feat_1)
proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1))
return proj_feat
def load_pretrained_weights(self):
pretrained_msvd = torch.load(self.pretrained_path)['state_dict']
for n_pretrained, p_pretrained in pretrained_msvd.items():
if 'dnc' in n_pretrained:
self.state_dict()[n_pretrained].copy_(p_pretrained)
        print('Pre-trained DNC weights successfully loaded!')
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
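# make_mask flags padding: a position is masked iff its feature vector is
# all zeros, giving a boolean tensor of shape (b, 1, 1, seq_len) that
# broadcasts over attention heads and query positions.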
class Net2(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net2, self).__init__()
print('Training with Network type 2: VLCN-FLF')
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = VLC(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_frame = AttFlat(__C)
self.attflat_clip = AttFlat(__C)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# Backbone Framework
lang_feat, frame_feat, clip_feat = self.backbone(
lang_feat,
frame_feat,
clip_feat,
lang_feat_mask,
frame_feat_mask,
clip_feat_mask
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
frame_feat = self.attflat_frame(
frame_feat,
frame_feat_mask
)
clip_feat = self.attflat_clip(
clip_feat,
clip_feat_mask
)
proj_feat = lang_feat + frame_feat + clip_feat
proj_feat = self.proj_norm(proj_feat)
proj_feat = torch.sigmoid(self.proj(proj_feat))
return proj_feat
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
class Net3(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net3, self).__init__()
print('Training with Network type 3: VLCN+LSTM')
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = VLC(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_frame = AttFlat(__C)
self.attflat_clip = AttFlat(__C)
self.lstm_fusion = nn.LSTM(
input_size=__C.FLAT_OUT_SIZE,
hidden_size=__C.FLAT_OUT_SIZE,
num_layers=2,
batch_first=True,
bidirectional=True
)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj_feat_1 = nn.Linear(__C.FLAT_OUT_SIZE * 2, __C.FLAT_OUT_SIZE)
self.proj_norm_lstm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# Backbone Framework
lang_feat, frame_feat, clip_feat = self.backbone(
lang_feat,
frame_feat,
clip_feat,
lang_feat_mask,
frame_feat_mask,
clip_feat_mask
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
frame_feat = self.attflat_frame(
frame_feat,
frame_feat_mask
)
clip_feat = self.attflat_clip(
clip_feat,
clip_feat_mask
)
proj_feat_0 = lang_feat + frame_feat + clip_feat
proj_feat_0 = self.proj_norm(proj_feat_0)
proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1)
proj_feat_1, _ = self.lstm_fusion(proj_feat_1)
proj_feat_1 = proj_feat_1.sum(1)
proj_feat_1 = self.proj_feat_1(proj_feat_1)
proj_feat_1 = self.proj_norm_lstm(proj_feat_1)
proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1))
return proj_feat
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
class Net4(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net4, self).__init__()
print('Training with Network type 4: MCAN')
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = MCA_ED(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_vid = AttFlat(__C)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# concat frame and clip features
vid_feat = torch.cat([frame_feat, clip_feat], dim=1)
vid_feat_mask = torch.cat([frame_feat_mask, clip_feat_mask], dim=-1)
# Backbone Framework
lang_feat, vid_feat = self.backbone(
lang_feat,
vid_feat,
lang_feat_mask,
vid_feat_mask,
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
vid_feat = self.attflat_vid(
vid_feat,
vid_feat_mask
)
proj_feat = lang_feat + vid_feat
proj_feat = self.proj_norm(proj_feat)
proj_feat = torch.sigmoid(self.proj(proj_feat))
return proj_feat
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
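# Minimal usage sketch (illustrative only; `cfg` is a hypothetical config
# object exposing the attributes referenced above, with features
# pre-extracted by the dataset):
#
#   net = Net1(cfg, pretrained_emb, token_size, answer_size)
#   scores = net(frame_feat, clip_feat, ques_ix)  # (b, answer_size), sigmoid scores
#
# All four variants share the same call signature, so they can be swapped
# behind a single training loop.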

62
code/core/model/net_utils.py Normal file
View file

@ -0,0 +1,62 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
import torch.nn as nn
import os
import torch
class FC(nn.Module):
def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
super(FC, self).__init__()
self.dropout_r = dropout_r
self.use_relu = use_relu
self.linear = nn.Linear(in_size, out_size)
if use_relu:
self.relu = nn.ReLU(inplace=True)
if dropout_r > 0:
self.dropout = nn.Dropout(dropout_r)
def forward(self, x):
x = self.linear(x)
if self.use_relu:
x = self.relu(x)
if self.dropout_r > 0:
x = self.dropout(x)
return x
class MLP(nn.Module):
def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
super(MLP, self).__init__()
self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
self.linear = nn.Linear(mid_size, out_size)
def forward(self, x):
return self.linear(self.fc(x))
class LayerNorm(nn.Module):
def __init__(self, size, eps=1e-6):
super(LayerNorm, self).__init__()
self.eps = eps
self.a_2 = nn.Parameter(torch.ones(size))
self.b_2 = nn.Parameter(torch.zeros(size))
def forward(self, x):
mean = x.mean(-1, keepdim=True)
std = x.std(-1, keepdim=True)
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
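# This hand-rolled LayerNorm matches nn.LayerNorm(size) up to the eps
# placement: here eps is added to the standard deviation itself rather
# than to the variance inside the square root.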

98
code/core/model/optim.py Normal file
View file

@ -0,0 +1,98 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
import torch
import torch.optim as Optim
class WarmupOptimizer(object):
def __init__(self, lr_base, optimizer, data_size, batch_size):
self.optimizer = optimizer
self._step = 0
self.lr_base = lr_base
self._rate = 0
self.data_size = data_size
self.batch_size = batch_size
def step(self):
self._step += 1
rate = self.rate()
for p in self.optimizer.param_groups:
p['lr'] = rate
self._rate = rate
self.optimizer.step()
def zero_grad(self):
self.optimizer.zero_grad()
def rate(self, step=None):
if step is None:
step = self._step
if step <= int(self.data_size / self.batch_size * 1):
r = self.lr_base * 1/4.
elif step <= int(self.data_size / self.batch_size * 2):
r = self.lr_base * 2/4.
elif step <= int(self.data_size / self.batch_size * 3):
r = self.lr_base * 3/4.
else:
r = self.lr_base
return r
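# The warmup spans the first three epochs: with data_size / batch_size
# steps per epoch, the learning rate is lr_base * 1/4 during epoch 1,
# 2/4 during epoch 2, 3/4 during epoch 3, and lr_base afterwards.
# For example (hypothetical numbers), data_size=30000 and batch_size=64
# give 468 steps per epoch, so step 1000 falls in epoch 3 and uses
# 0.75 * lr_base.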
def get_optim(__C, model, data_size, optimizer, lr_base=None):
if lr_base is None:
lr_base = __C.LR_BASE
# modules = model._modules
# params_list = []
# for m in modules:
# if 'dnc' in m:
# params_list.append({
# 'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
# 'lr': __C.LR_DNC_BASE,
# 'flag': True
# })
# else:
# params_list.append({
# 'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
# })
if optimizer == 'adam':
optim = Optim.Adam(
filter(lambda p: p.requires_grad, model.parameters()),
lr=0,
betas=__C.OPT_BETAS,
eps=__C.OPT_EPS,
)
elif optimizer == 'rmsprop':
optim = Optim.RMSprop(
filter(lambda p: p.requires_grad, model.parameters()),
lr=0,
eps=__C.OPT_EPS,
weight_decay=__C.OPT_WEIGHT_DECAY
)
else:
        raise ValueError('{} optimizer is not supported'.format(optimizer))
return WarmupOptimizer(
lr_base,
optim,
data_size,
__C.BATCH_SIZE
)
def adjust_lr(optim, decay_r):
optim.lr_base *= decay_r
def adjust_lr_dnc(optim, decay_r):
optim.lr_dnc_base *= decay_r
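# NOTE: adjust_lr_dnc reads optim.lr_dnc_base, which WarmupOptimizer never
# defines, so calling it as-is raises AttributeError; it presumably pairs
# with the commented-out per-module parameter groups (and their separate
# LR_DNC_BASE) in get_optim above.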

163
code/core/model/utils.py Normal file
View file

@ -0,0 +1,163 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""
import torch.nn as nn
import torch as T
import torch.nn.functional as F
import numpy as np
import torch
from torch.autograd import Variable
import re
import string
def recursiveTrace(obj):
print(type(obj))
if hasattr(obj, 'grad_fn'):
print(obj.grad_fn)
recursiveTrace(obj.grad_fn)
elif hasattr(obj, 'saved_variables'):
print(obj.requires_grad, len(obj.saved_tensors), len(obj.saved_variables))
[print(v) for v in obj.saved_variables]
[recursiveTrace(v.grad_fn) for v in obj.saved_variables]
def cuda(x, grad=False, gpu_id=-1):
x = x.float() if T.is_tensor(x) else x
if gpu_id == -1:
t = T.FloatTensor(x)
t.requires_grad=grad
return t
else:
t = T.FloatTensor(x.pin_memory()).cuda(gpu_id)
t.requires_grad=grad
return t
def cudavec(x, grad=False, gpu_id=-1):
if gpu_id == -1:
t = T.Tensor(T.from_numpy(x))
t.requires_grad = grad
return t
else:
t = T.Tensor(T.from_numpy(x).pin_memory()).cuda(gpu_id)
t.requires_grad = grad
return t
def cudalong(x, grad=False, gpu_id=-1):
if gpu_id == -1:
        t = T.LongTensor(T.from_numpy(x.astype(np.int64)))
t.requires_grad = grad
return t
else:
        t = T.LongTensor(T.from_numpy(x.astype(np.int64)).pin_memory()).cuda(gpu_id)
t.requires_grad = grad
return t
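# cuda/cudavec/cudalong are Variable-era helpers; on current PyTorch the
# same effect is torch.as_tensor(x, dtype=..., device=...) followed by
# .requires_grad_(grad) where gradients are needed.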
def θ(a, b, normBy=2):
"""Batchwise Cosine similarity
Cosine similarity
Arguments:
a {Tensor} -- A 3D Tensor (b * m * w)
b {Tensor} -- A 3D Tensor (b * r * w)
Returns:
Tensor -- Batchwise cosine similarity (b * r * m)
"""
dot = T.bmm(a, b.transpose(1,2))
a_norm = T.norm(a, normBy, dim=2).unsqueeze(2)
b_norm = T.norm(b, normBy, dim=2).unsqueeze(1)
cos = dot / (a_norm * b_norm + δ)
return cos.transpose(1,2).contiguous()
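# Example: with memory (b, m, w) = (2, 16, 8) and read keys (2, 4, 8),
# θ(memory, keys) returns a (2, 4, 16) tensor of cosine similarities,
# one row per read head, with δ smoothing the denominator against
# division by zero.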
def σ(input, axis=1):
"""Softmax on an axis
Softmax on an axis
Arguments:
input {Tensor} -- input Tensor
Keyword Arguments:
axis {number} -- axis on which to take softmax on (default: {1})
Returns:
Tensor -- Softmax output Tensor
"""
input_size = input.size()
trans_input = input.transpose(axis, len(input_size) - 1)
trans_size = trans_input.size()
input_2d = trans_input.contiguous().view(-1, trans_size[-1])
soft_max_2d = F.softmax(input_2d, -1)
soft_max_nd = soft_max_2d.view(*trans_size)
return soft_max_nd.transpose(axis, len(input_size) - 1)
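# σ is equivalent to F.softmax(input, dim=axis) in modern PyTorch: it
# transposes the target axis to the end, applies a 2D softmax, and
# transposes back.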
δ = 1e-6
def register_nan_checks(model):
def check_grad(module, grad_input, grad_output):
# print(module) you can add this to see that the hook is called
# print('hook called for ' + str(type(module)))
if any(np.all(np.isnan(gi.data.cpu().numpy())) for gi in grad_input if gi is not None):
print('NaN gradient in grad_input ' + type(module).__name__)
model.apply(lambda module: module.register_backward_hook(check_grad))
def apply_dict(dic):
for k, v in dic.items():
apply_var(v, k)
if isinstance(v, nn.Module):
key_list = [a for a in dir(v) if not a.startswith('__')]
for key in key_list:
apply_var(getattr(v, key), key)
for pk, pv in v._parameters.items():
apply_var(pv, pk)
def apply_var(v, k):
if isinstance(v, Variable) and v.requires_grad:
v.register_hook(check_nan_gradient(k))
def check_nan_gradient(name=''):
def f(tensor):
if np.isnan(T.mean(tensor).data.cpu().numpy()):
print('\nnan gradient of {} :'.format(name))
# print(tensor)
# assert 0, 'nan gradient'
return tensor
return f
def ptr(tensor):
if T.is_tensor(tensor):
return tensor.storage().data_ptr()
elif hasattr(tensor, 'data'):
return tensor.clone().data.storage().data_ptr()
else:
return tensor
# TODO: refactor this ad-hoc device dispatch
def ensure_gpu(tensor, gpu_id):
if "cuda" in str(type(tensor)) and gpu_id != -1:
return tensor.cuda(gpu_id)
elif "cuda" in str(type(tensor)):
return tensor.cpu()
elif "Tensor" in str(type(tensor)) and gpu_id != -1:
return tensor.cuda(gpu_id)
elif "Tensor" in str(type(tensor)):
return tensor
elif type(tensor) is np.ndarray:
return cudavec(tensor, gpu_id=gpu_id).data
else:
return tensor
def print_gradient(x, name):
s = "Gradient of " + name + " ----------------------------------"
x.register_hook(lambda y: print(s, y.squeeze()))