Initial commit

Adnen Abdessaied 2022-03-30 10:46:35 +02:00
commit b5f3b728c3
53 changed files with 7008 additions and 0 deletions

0
core/model/.gitkeep Normal file

80
core/model/C3D.py Normal file

@@ -0,0 +1,80 @@
"""
from https://github.com/DavideA/c3d-pytorch/blob/master/C3D_model.py
"""
import torch.nn as nn
class C3D(nn.Module):
"""
The C3D network as described in [1].
"""
def __init__(self):
super(C3D, self).__init__()
self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))
self.fc6 = nn.Linear(8192, 4096)
self.fc7 = nn.Linear(4096, 4096)
self.fc8 = nn.Linear(4096, 487)
self.dropout = nn.Dropout(p=0.5)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)  # dim must be explicit; implicit dim is deprecated
def forward(self, x):
h = self.relu(self.conv1(x))
h = self.pool1(h)
h = self.relu(self.conv2(h))
h = self.pool2(h)
h = self.relu(self.conv3a(h))
h = self.relu(self.conv3b(h))
h = self.pool3(h)
h = self.relu(self.conv4a(h))
h = self.relu(self.conv4b(h))
h = self.pool4(h)
h = self.relu(self.conv5a(h))
h = self.relu(self.conv5b(h))
h = self.pool5(h)
h = h.view(-1, 8192)
h = self.relu(self.fc6(h))
h = self.dropout(h)
h = self.relu(self.fc7(h))
# h = self.dropout(h)
# logits = self.fc8(h)
# probs = self.softmax(logits)
return h
"""
References
----------
[1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks."
Proceedings of the IEEE international conference on computer vision. 2015.
"""

323
core/model/dnc.py Normal file

@@ -0,0 +1,323 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""
# -*- coding: utf-8 -*-
import torch.nn as nn
import torch as T
from torch.autograd import Variable as var
import numpy as np
from torch.nn.utils.rnn import pad_packed_sequence as pad
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import PackedSequence
from .util import *
from .memory import *
from torch.nn.init import orthogonal_, xavier_uniform_
class DNC(nn.Module):
def __init__(
self,
input_size,
hidden_size,
rnn_type='lstm',
num_layers=1,
num_hidden_layers=2,
bias=True,
batch_first=True,
dropout=0,
bidirectional=False,
nr_cells=5,
read_heads=2,
cell_size=10,
nonlinearity='tanh',
gpu_id=-1,
independent_linears=False,
share_memory=True,
debug=False,
clip=20
):
super(DNC, self).__init__()
# todo: separate weights and RNNs for the interface and output vectors
self.input_size = input_size
self.hidden_size = hidden_size
self.rnn_type = rnn_type
self.num_layers = num_layers
self.num_hidden_layers = num_hidden_layers
self.bias = bias
self.batch_first = batch_first
self.dropout = dropout
self.bidirectional = bidirectional
self.nr_cells = nr_cells
self.read_heads = read_heads
self.cell_size = cell_size
self.nonlinearity = nonlinearity
self.gpu_id = gpu_id
self.independent_linears = independent_linears
self.share_memory = share_memory
self.debug = debug
self.clip = clip
self.w = self.cell_size
self.r = self.read_heads
self.read_vectors_size = self.r * self.w
self.output_size = self.hidden_size
self.nn_input_size = self.input_size + self.read_vectors_size
self.nn_output_size = self.output_size + self.read_vectors_size
self.rnns = []
self.memories = []
for layer in range(self.num_layers):
if self.rnn_type.lower() == 'rnn':
self.rnns.append(nn.RNN((self.nn_input_size if layer == 0 else self.nn_output_size), self.output_size,
bias=self.bias, nonlinearity=self.nonlinearity, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
elif self.rnn_type.lower() == 'gru':
self.rnns.append(nn.GRU((self.nn_input_size if layer == 0 else self.nn_output_size),
self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
elif self.rnn_type.lower() == 'lstm':
self.rnns.append(nn.LSTM((self.nn_input_size if layer == 0 else self.nn_output_size),
self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
setattr(self, self.rnn_type.lower() + '_layer_' + str(layer), self.rnns[layer])
# memories for each layer
if not self.share_memory:
self.memories.append(
Memory(
input_size=self.output_size,
mem_size=self.nr_cells,
cell_size=self.w,
read_heads=self.r,
gpu_id=self.gpu_id,
independent_linears=self.independent_linears
)
)
setattr(self, 'rnn_layer_memory_' + str(layer), self.memories[layer])
# only one memory shared by all layers
if self.share_memory:
self.memories.append(
Memory(
input_size=self.output_size,
mem_size=self.nr_cells,
cell_size=self.w,
read_heads=self.r,
gpu_id=self.gpu_id,
independent_linears=self.independent_linears
)
)
setattr(self, 'rnn_layer_memory_shared', self.memories[0])
# final output layer
self.output = nn.Linear(self.nn_output_size, self.output_size)
orthogonal_(self.output.weight)
if self.gpu_id != -1:
[x.cuda(self.gpu_id) for x in self.rnns]
[x.cuda(self.gpu_id) for x in self.memories]
self.output.cuda(self.gpu_id)
def _init_hidden(self, hx, batch_size, reset_experience):
# create empty hidden states if not provided
if hx is None:
hx = (None, None, None)
(chx, mhx, last_read) = hx
# initialize hidden state of the controller RNN
if chx is None:
h = cuda(T.zeros(self.num_hidden_layers, batch_size, self.output_size), gpu_id=self.gpu_id)
xavier_uniform_(h)
chx = [(h, h) if self.rnn_type.lower() == 'lstm' else h for _ in range(self.num_layers)]
# Last read vectors
if last_read is None:
last_read = cuda(T.zeros(batch_size, self.w * self.r), gpu_id=self.gpu_id)
# memory states
if mhx is None:
if self.share_memory:
mhx = self.memories[0].reset(batch_size, erase=reset_experience)
else:
mhx = [m.reset(batch_size, erase=reset_experience) for m in self.memories]
else:
if self.share_memory:
mhx = self.memories[0].reset(batch_size, mhx, erase=reset_experience)
else:
mhx = [m.reset(batch_size, h, erase=reset_experience) for m, h in zip(self.memories, mhx)]
return chx, mhx, last_read
def _debug(self, mhx, debug_obj):
if not debug_obj:
debug_obj = {
'memory': [],
'link_matrix': [],
'precedence': [],
'read_weights': [],
'write_weights': [],
'usage_vector': [],
}
debug_obj['memory'].append(mhx['memory'][0].data.cpu().numpy())
debug_obj['link_matrix'].append(mhx['link_matrix'][0][0].data.cpu().numpy())
debug_obj['precedence'].append(mhx['precedence'][0].data.cpu().numpy())
debug_obj['read_weights'].append(mhx['read_weights'][0].data.cpu().numpy())
debug_obj['write_weights'].append(mhx['write_weights'][0].data.cpu().numpy())
debug_obj['usage_vector'].append(mhx['usage_vector'][0].unsqueeze(0).data.cpu().numpy())
return debug_obj
def _layer_forward(self, input, layer, hx=(None, None), pass_through_memory=True):
(chx, mhx) = hx
# pass through the controller layer
input, chx = self.rnns[layer](input.unsqueeze(1), chx)
input = input.squeeze(1)
# clip the controller output
if self.clip != 0:
output = T.clamp(input, -self.clip, self.clip)
else:
output = input
# the interface vector
ξ = output
# pass through memory
if pass_through_memory:
if self.share_memory:
read_vecs, mhx = self.memories[0](ξ, mhx)
else:
read_vecs, mhx = self.memories[layer](ξ, mhx)
# the read vectors
read_vectors = read_vecs.view(-1, self.w * self.r)
else:
read_vectors = None
return output, (chx, mhx, read_vectors)
def forward(self, input, hx=(None, None, None), reset_experience=False, pass_through_memory=True):
# handle packed data
is_packed = type(input) is PackedSequence
if is_packed:
input, lengths = pad(input)
max_length = lengths[0]
else:
max_length = input.size(1) if self.batch_first else input.size(0)
lengths = [input.size(1)] * max_length if self.batch_first else [input.size(0)] * max_length
batch_size = input.size(0) if self.batch_first else input.size(1)
if not self.batch_first:
input = input.transpose(0, 1)
# make the data time-first
controller_hidden, mem_hidden, last_read = self._init_hidden(hx, batch_size, reset_experience)
# concat input with last read (or padding) vectors
inputs = [T.cat([input[:, x, :], last_read], 1) for x in range(max_length)]
# batched forward pass per element / word / etc
if self.debug:
viz = None
outs = [None] * max_length
read_vectors = None
rv = [None] * max_length
# pass through time
for time in range(max_length):
# pass through layers
for layer in range(self.num_layers):
# this layer's hidden states
chx = controller_hidden[layer]
m = mem_hidden if self.share_memory else mem_hidden[layer]
# pass through controller
outs[time], (chx, m, read_vectors) = \
self._layer_forward(inputs[time], layer, (chx, m), pass_through_memory)
# debug memory
if self.debug:
viz = self._debug(m, viz)
# store the memory back (per layer or shared)
if self.share_memory:
mem_hidden = m
else:
mem_hidden[layer] = m
controller_hidden[layer] = chx
if read_vectors is not None:
# the controller output + read vectors go into next layer
outs[time] = T.cat([outs[time], read_vectors], 1)
if layer == self.num_layers - 1:
rv[time] = read_vectors.reshape(batch_size, self.r, self.w)
else:
outs[time] = T.cat([outs[time], last_read], 1)
inputs[time] = outs[time]
if self.debug:
viz = {k: np.array(v) for k, v in viz.items()}
viz = {k: v.reshape(v.shape[0], v.shape[1] * v.shape[2]) for k, v in viz.items()}
# pass through final output layer
inputs = [self.output(i) for i in inputs]
outputs = T.stack(inputs, 1 if self.batch_first else 0)
if is_packed:
outputs = pack(outputs, lengths)
if self.debug:
return outputs, (controller_hidden, mem_hidden, read_vectors), rv, viz
else:
return outputs, (controller_hidden, mem_hidden, read_vectors), rv
def __repr__(self):
s = "\n----------------------------------------\n"
s += '{name}({input_size}, {hidden_size}'
if self.rnn_type != 'lstm':
s += ', rnn_type={rnn_type}'
if self.num_layers != 1:
s += ', num_layers={num_layers}'
if self.num_hidden_layers != 2:
s += ', num_hidden_layers={num_hidden_layers}'
if self.bias != True:
s += ', bias={bias}'
if self.batch_first != True:
s += ', batch_first={batch_first}'
if self.dropout != 0:
s += ', dropout={dropout}'
if self.bidirectional != False:
s += ', bidirectional={bidirectional}'
if self.nr_cells != 5:
s += ', nr_cells={nr_cells}'
if self.read_heads != 2:
s += ', read_heads={read_heads}'
if self.cell_size != 10:
s += ', cell_size={cell_size}'
if self.nonlinearity != 'tanh':
s += ', nonlinearity={nonlinearity}'
if self.gpu_id != -1:
s += ', gpu_id={gpu_id}'
if self.independent_linears != False:
s += ', independent_linears={independent_linears}'
if self.share_memory != True:
s += ', share_memory={share_memory}'
if self.debug != False:
s += ', debug={debug}'
if self.clip != 20:
s += ', clip={clip}'
s += ")\n" + super(DNC, self).__repr__() + \
"\n----------------------------------------\n"
return s.format(name=self.__class__.__name__, **self.__dict__)
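For orientation, a usage sketch for the class above (assumptions: CPU execution with gpu_id=-1, and that core.model.util and core.model.memory resolve as imported at the top of the file). With batch_first=True the input is (batch, time, input_size), and the non-debug return value follows the forward() signature above:

import torch
from core.model.dnc import DNC

rnn = DNC(input_size=64, hidden_size=128, nr_cells=16, read_heads=2,
          cell_size=10, gpu_id=-1)
x = torch.randn(4, 7, 64)
out, (chx, mhx, last_read), rv = rnn(x, (None, None, None), reset_experience=True)
print(out.shape)  # torch.Size([4, 7, 128]) -- hidden_size per time step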

208
core/model/mca.py Normal file

@@ -0,0 +1,208 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from core.model.net_utils import FC, MLP, LayerNorm
from core.model.dnc_improved import DNC, SharedMemDNC
from core.model.dnc_improved import FeedforwardController
import torch.nn as nn
import torch.nn.functional as F
import torch, math
import time
# ------------------------------
# ---- Multi-Head Attention ----
# ------------------------------
class MHAtt(nn.Module):
def __init__(self, __C):
super(MHAtt, self).__init__()
self.__C = __C
self.linear_v = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.linear_k = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.linear_q = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.linear_merge = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
self.dropout = nn.Dropout(__C.DROPOUT_R)
def forward(self, v, k, q, mask):
n_batches = q.size(0)
v = self.linear_v(v).view(
n_batches,
-1,
self.__C.MULTI_HEAD,
self.__C.HIDDEN_SIZE_HEAD
).transpose(1, 2)
k = self.linear_k(k).view(
n_batches,
-1,
self.__C.MULTI_HEAD,
self.__C.HIDDEN_SIZE_HEAD
).transpose(1, 2)
q = self.linear_q(q).view(
n_batches,
-1,
self.__C.MULTI_HEAD,
self.__C.HIDDEN_SIZE_HEAD
).transpose(1, 2)
atted = self.att(v, k, q, mask)
atted = atted.transpose(1, 2).contiguous().view(
n_batches,
-1,
self.__C.HIDDEN_SIZE
)
atted = self.linear_merge(atted)
return atted
def att(self, value, key, query, mask):
d_k = query.size(-1)
scores = torch.matmul(
query, key.transpose(-2, -1)
) / math.sqrt(d_k)
if mask is not None:
scores = scores.masked_fill(mask, -1e9)
att_map = F.softmax(scores, dim=-1)
att_map = self.dropout(att_map)
return torch.matmul(att_map, value)
# ---------------------------
# ---- Feed Forward Nets ----
# ---------------------------
class FFN(nn.Module):
def __init__(self, __C):
super(FFN, self).__init__()
self.mlp = MLP(
in_size=__C.HIDDEN_SIZE,
mid_size=__C.FF_SIZE,
out_size=__C.HIDDEN_SIZE,
dropout_r=__C.DROPOUT_R,
use_relu=True
)
def forward(self, x):
return self.mlp(x)
# ------------------------
# ---- Self Attention ----
# ------------------------
class SA(nn.Module):
def __init__(self, __C):
super(SA, self).__init__()
self.mhatt = MHAtt(__C)
self.ffn = FFN(__C)
self.dropout1 = nn.Dropout(__C.DROPOUT_R)
self.norm1 = LayerNorm(__C.HIDDEN_SIZE)
self.dropout2 = nn.Dropout(__C.DROPOUT_R)
self.norm2 = LayerNorm(__C.HIDDEN_SIZE)
def forward(self, x, x_mask):
x = self.norm1(x + self.dropout1(
self.mhatt(x, x, x, x_mask)
))
x = self.norm2(x + self.dropout2(
self.ffn(x)
))
return x
# -------------------------------
# ---- Self Guided Attention ----
# -------------------------------
class SGA(nn.Module):
def __init__(self, __C):
super(SGA, self).__init__()
self.mhatt1 = MHAtt(__C)
self.mhatt2 = MHAtt(__C)
self.ffn = FFN(__C)
self.dropout1 = nn.Dropout(__C.DROPOUT_R)
self.norm1 = LayerNorm(__C.HIDDEN_SIZE)
self.dropout2 = nn.Dropout(__C.DROPOUT_R)
self.norm2 = LayerNorm(__C.HIDDEN_SIZE)
self.dropout3 = nn.Dropout(__C.DROPOUT_R)
self.norm3 = LayerNorm(__C.HIDDEN_SIZE)
def forward(self, x, y, x_mask, y_mask):
x = self.norm1(x + self.dropout1(
self.mhatt1(x, x, x, x_mask)
))
x = self.norm2(x + self.dropout2(
self.mhatt2(y, y, x, y_mask)
))
x = self.norm3(x + self.dropout3(
self.ffn(x)
))
return x
# ------------------------------------------------
# ---- MCA Layers Cascaded by Encoder-Decoder ----
# ------------------------------------------------
class MCA_ED(nn.Module):
def __init__(self, __C):
super(MCA_ED, self).__init__()
self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)])
self.dec_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
def forward(self, x, y, x_mask, y_mask):
# Get hidden vector
for enc in self.enc_list:
x = enc(x, x_mask)
for dec in self.dec_list:
y = dec(y, x, y_mask, x_mask)
return x, y
class VLC(nn.Module):
def __init__(self, __C):
super(VLC, self).__init__()
self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)])
self.dec_lang_frames_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
self.dec_lang_clips_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
def forward(self, x, y, z, x_mask, y_mask, z_mask):
# Get hidden vector
for enc in self.enc_list:
x = enc(x, x_mask)
for dec in self.dec_lang_frames_list:
y = dec(y, x, y_mask, x_mask)
for dec in self.dec_lang_clips_list:
z = dec(z, x, z_mask, x_mask)
return x, y, z
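The modules above read all hyper-parameters off the __C config object, so a stand-in config is enough to exercise them. A sketch (SimpleNamespace is an assumption, any object with these attributes works; HIDDEN_SIZE must equal MULTI_HEAD * HIDDEN_SIZE_HEAD, and core.model.dnc_improved must be importable since this file imports it):

from types import SimpleNamespace
import torch
from core.model.mca import SA

__C = SimpleNamespace(HIDDEN_SIZE=512, MULTI_HEAD=8, HIDDEN_SIZE_HEAD=64,
                      DROPOUT_R=0.1, FF_SIZE=2048, LAYER=6)
sa = SA(__C)
x = torch.randn(2, 14, 512)                          # (batch, tokens, hidden)
x_mask = torch.zeros(2, 1, 1, 14, dtype=torch.bool)  # True marks padded positions
print(sa(x, x_mask).shape)                           # torch.Size([2, 14, 512])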

314
core/model/memory.py Normal file

@@ -0,0 +1,314 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""
# -*- coding: utf-8 -*-
import torch.nn as nn
import torch as T
from torch.autograd import Variable as var
import torch.nn.functional as F
import numpy as np
from core.model.util import *
class Memory(nn.Module):
def __init__(self, input_size, mem_size=512, cell_size=32, read_heads=4, gpu_id=-1, independent_linears=True):
super(Memory, self).__init__()
self.input_size = input_size
self.mem_size = mem_size
self.cell_size = cell_size
self.read_heads = read_heads
self.gpu_id = gpu_id
self.independent_linears = independent_linears
m = self.mem_size
w = self.cell_size
r = self.read_heads
if self.independent_linears:
self.read_keys_transform = nn.Linear(self.input_size, w * r)
self.read_strengths_transform = nn.Linear(self.input_size, r)
self.write_key_transform = nn.Linear(self.input_size, w)
self.write_strength_transform = nn.Linear(self.input_size, 1)
self.erase_vector_transform = nn.Linear(self.input_size, w)
self.write_vector_transform = nn.Linear(self.input_size, w)
self.free_gates_transform = nn.Linear(self.input_size, r)
self.allocation_gate_transform = nn.Linear(self.input_size, 1)
self.write_gate_transform = nn.Linear(self.input_size, 1)
self.read_modes_transform = nn.Linear(self.input_size, 3 * r)
else:
self.interface_size = (w * r) + (3 * w) + (5 * r) + 3
self.interface_weights = nn.Linear(
self.input_size, self.interface_size)
self.I = cuda(1 - T.eye(m).unsqueeze(0),
gpu_id=self.gpu_id) # (1 * n * n)
def reset(self, batch_size=1, hidden=None, erase=True):
m = self.mem_size
w = self.cell_size
r = self.read_heads
b = batch_size
if hidden is None:
return {
'memory': cuda(T.zeros(b, m, w).fill_(0), gpu_id=self.gpu_id),
'link_matrix': cuda(T.zeros(b, 1, m, m), gpu_id=self.gpu_id),
'precedence': cuda(T.zeros(b, 1, m), gpu_id=self.gpu_id),
'read_weights': cuda(T.zeros(b, r, m).fill_(0), gpu_id=self.gpu_id),
'write_weights': cuda(T.zeros(b, 1, m).fill_(0), gpu_id=self.gpu_id),
'usage_vector': cuda(T.zeros(b, m), gpu_id=self.gpu_id),
# 'free_gates': cuda(T.zeros(b, r), gpu_id=self.gpu_id),
# 'alloc_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id),
# 'write_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id),
# 'read_modes': cuda(T.zeros(b, r, 3), gpu_id=self.gpu_id)
}
else:
hidden['memory'] = hidden['memory'].clone()
hidden['link_matrix'] = hidden['link_matrix'].clone()
hidden['precedence'] = hidden['precedence'].clone()
hidden['read_weights'] = hidden['read_weights'].clone()
hidden['write_weights'] = hidden['write_weights'].clone()
hidden['usage_vector'] = hidden['usage_vector'].clone()
# hidden['free_gates'] = hidden['free_gates'].clone()
# hidden['alloc_gates'] = hidden['alloc_gates'].clone()
# hidden['write_gates'] = hidden['write_gates'].clone()
# hidden['read_modes'] = hidden['read_modes'].clone()
if erase:
hidden['memory'].data.fill_(0)
hidden['link_matrix'].data.zero_()
hidden['precedence'].data.zero_()
hidden['read_weights'].data.fill_(0)
hidden['write_weights'].data.fill_(0)
hidden['usage_vector'].data.zero_()
# hidden['free_gates'].data.fill_()
# hidden['alloc_gates'].data.fill_()
# hidden['write_gates'].data.fill_()
# hidden['read_modes'].data.fill_()
return hidden
def get_usage_vector(self, usage, free_gates, read_weights, write_weights):
# write_weights = write_weights.detach() # detach from the computation graph
# if read_weights.size(0) > free_gates.size(0):
# read_weights = read_weights[:free_gates.size(0), :, :]
# if usage.size(0) > free_gates.size(0):
# usage = usage[:free_gates.size(0), :]
# if write_weights.size(0) > free_gates.size(0):
# write_weights = write_weights[:free_gates.size(0), :, :]
usage = usage + (1 - usage) * (1 - T.prod(1 - write_weights, 1))
ψ = T.prod(1 - free_gates.unsqueeze(2) * read_weights, 1)
return usage * ψ
def allocate(self, usage, write_gate):
# ensure values are not too small prior to cumprod.
usage = δ + (1 - δ) * usage
batch_size = usage.size(0)
# free list
sorted_usage, φ = T.topk(usage, self.mem_size, dim=1, largest=False)
# cumprod with exclusive=True
# https://discuss.pytorch.org/t/cumprod-exclusive-true-equivalences/2614/8
v = var(sorted_usage.data.new(batch_size, 1).fill_(1))
cat_sorted_usage = T.cat((v, sorted_usage), 1)
prod_sorted_usage = T.cumprod(cat_sorted_usage, 1)[:, :-1]
sorted_allocation_weights = (1 - sorted_usage) * prod_sorted_usage.squeeze()
# construct the reverse sorting index https://stackoverflow.com/questions/2483696/undo-or-reverse-argsort-python
_, φ_rev = T.topk(φ, k=self.mem_size, dim=1, largest=False)
allocation_weights = sorted_allocation_weights.gather(1, φ_rev.long())
return allocation_weights.unsqueeze(1), usage
def write_weighting(self, memory, write_content_weights, allocation_weights, write_gate, allocation_gate):
ag = allocation_gate.unsqueeze(-1)
wg = write_gate.unsqueeze(-1)
return wg * (ag * allocation_weights + (1 - ag) * write_content_weights)
def get_link_matrix(self, link_matrix, write_weights, precedence):
precedence = precedence.unsqueeze(2)
write_weights_i = write_weights.unsqueeze(3)
write_weights_j = write_weights.unsqueeze(2)
prev_scale = 1 - write_weights_i - write_weights_j
new_link_matrix = write_weights_i * precedence
link_matrix = prev_scale * link_matrix + new_link_matrix
# trick to delete diag elems
return self.I.expand_as(link_matrix) * link_matrix
def update_precedence(self, precedence, write_weights):
return (1 - T.sum(write_weights, 2, keepdim=True)) * precedence + write_weights
def write(self, write_key, write_vector, erase_vector, free_gates, read_strengths, write_strength, write_gate, allocation_gate, hidden):
# get current usage
hidden['usage_vector'] = self.get_usage_vector(
hidden['usage_vector'],
free_gates,
hidden['read_weights'],
hidden['write_weights']
)
# lookup memory with write_key and write_strength
write_content_weights = self.content_weightings(
hidden['memory'], write_key, write_strength)
# get memory allocation
alloc, _ = self.allocate(
hidden['usage_vector'],
allocation_gate * write_gate
)
# get write weightings
hidden['write_weights'] = self.write_weighting(
hidden['memory'],
write_content_weights,
alloc,
write_gate,
allocation_gate
)
weighted_resets = hidden['write_weights'].unsqueeze(
3) * erase_vector.unsqueeze(2)
reset_gate = T.prod(1 - weighted_resets, 1)
# Update memory
hidden['memory'] = hidden['memory'] * reset_gate
hidden['memory'] = hidden['memory'] + \
T.bmm(hidden['write_weights'].transpose(1, 2), write_vector)
# update link_matrix
hidden['link_matrix'] = self.get_link_matrix(
hidden['link_matrix'],
hidden['write_weights'],
hidden['precedence']
)
hidden['precedence'] = self.update_precedence(
hidden['precedence'], hidden['write_weights'])
return hidden
def content_weightings(self, memory, keys, strengths):
# if memory.size(0) > keys.size(0):
# memory = memory[:keys.size(0), :, :]
d = θ(memory, keys)
return σ(d * strengths.unsqueeze(2), 2)
def directional_weightings(self, link_matrix, read_weights):
rw = read_weights.unsqueeze(1)
f = T.matmul(link_matrix, rw.transpose(2, 3)).transpose(2, 3)
b = T.matmul(rw, link_matrix)
return f.transpose(1, 2), b.transpose(1, 2)
def read_weightings(self, memory, content_weights, link_matrix, read_modes, read_weights):
forward_weight, backward_weight = self.directional_weightings(
link_matrix, read_weights)
content_mode = read_modes[:, :, 2].contiguous(
).unsqueeze(2) * content_weights
backward_mode = T.sum(
read_modes[:, :, 0:1].contiguous().unsqueeze(3) * backward_weight, 2)
forward_mode = T.sum(
read_modes[:, :, 1:2].contiguous().unsqueeze(3) * forward_weight, 2)
return backward_mode + content_mode + forward_mode
def read_vectors(self, memory, read_weights):
return T.bmm(read_weights, memory)
def read(self, read_keys, read_strengths, read_modes, hidden):
content_weights = self.content_weightings(
hidden['memory'], read_keys, read_strengths)
hidden['read_weights'] = self.read_weightings(
hidden['memory'],
content_weights,
hidden['link_matrix'],
read_modes,
hidden['read_weights']
)
read_vectors = self.read_vectors(
hidden['memory'], hidden['read_weights'])
return read_vectors, hidden
def forward(self, ξ, hidden):
# ξ = ξ.detach()
m = self.mem_size
w = self.cell_size
r = self.read_heads
b = ξ.size()[0]
if self.independent_linears:
# r read keys (b * r * w)
read_keys = self.read_keys_transform(ξ).view(b, r, w)
# r read strengths (b * r)
read_strengths = F.softplus(
self.read_strengths_transform(ξ).view(b, r))
# write key (b * 1 * w)
write_key = self.write_key_transform(ξ).view(b, 1, w)
# write strength (b * 1)
write_strength = F.softplus(
self.write_strength_transform(ξ).view(b, 1))
# erase vector (b * 1 * w)
erase_vector = T.sigmoid(
self.erase_vector_transform(ξ).view(b, 1, w))
# write vector (b * 1 * w)
write_vector = self.write_vector_transform(ξ).view(b, 1, w)
# r free gates (b * r)
free_gates = T.sigmoid(self.free_gates_transform(ξ).view(b, r))
# allocation gate (b * 1)
allocation_gate = T.sigmoid(
self.allocation_gate_transform(ξ).view(b, 1))
# write gate (b * 1)
write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1))
# read modes (b * r * 3)
read_modes = σ(self.read_modes_transform(ξ).view(b, r, 3), -1)
else:
ξ = self.interface_weights(ξ)
# r read keys (b * r * w)
read_keys = ξ[:, :r * w].contiguous().view(b, r, w)
# r read strengths (b * r)
read_strengths = F.softplus(
ξ[:, r * w:r * w + r].contiguous().view(b, r))
# write key (b * 1 * w)
write_key = ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w)
# write strength (b * 1)
write_strength = F.softplus(
ξ[:, r * w + r + w].contiguous().view(b, 1))
# erase vector (b * 1 * w)
erase_vector = T.sigmoid(
ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w))
# write vector (b * 1 * w)
write_vector = ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w)
# r free gates (b * r)
free_gates = T.sigmoid(
ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r))
# allocation gate (b * 1)
allocation_gate = T.sigmoid(
ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1))
# write gate (b * 1)
write_gate = T.sigmoid(
ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1)
# read modes (b * r * 3)
read_modes = σ(ξ[:, r * w + 2 * r + 3 * w + 3: r *
w + 5 * r + 3 * w + 3].contiguous().view(b, r, 3), -1)
hidden = self.write(write_key, write_vector, erase_vector, free_gates,
read_strengths, write_strength, write_gate, allocation_gate, hidden)
hidden["free_gates"] = free_gates.clone().detach()
hidden["allocation_gate"] = allocation_gate.clone().detach()
hidden["write_gate"] = write_gate.clone().detach()
hidden["read_modes"] = read_modes.clone().detach()
return self.read(read_keys, read_strengths, read_modes, hidden)
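A quick shape check of the module above (a sketch, assuming core.model.util resolves and running on CPU with gpu_id=-1). With independent_linears=False the layer projects the input to an interface vector of size (w * r) + 3 * w + 5 * r + 3, e.g. 63 for the toy sizes below, and read() returns r read vectors of width w:

import torch
from core.model.memory import Memory

mem = Memory(input_size=32, mem_size=8, cell_size=10, read_heads=2,
             gpu_id=-1, independent_linears=False)
hidden = mem.reset(batch_size=4)
read_vecs, hidden = mem(torch.randn(4, 32), hidden)
print(read_vecs.shape)  # torch.Size([4, 2, 10])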

501
core/model/net.py Normal file

@@ -0,0 +1,501 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
from core.model.net_utils import FC, MLP, LayerNorm
from core.model.mca import SA, MCA_ED, VLC
from core.model.dnc import DNC
import torch.nn as nn
import torch.nn.functional as F
import torch
# ------------------------------
# ---- Flatten the sequence ----
# ------------------------------
class AttFlat(nn.Module):
def __init__(self, __C):
super(AttFlat, self).__init__()
self.__C = __C
self.mlp = MLP(
in_size=__C.HIDDEN_SIZE,
mid_size=__C.FLAT_MLP_SIZE,
out_size=__C.FLAT_GLIMPSES,
dropout_r=__C.DROPOUT_R,
use_relu=True
)
self.linear_merge = nn.Linear(
__C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
__C.FLAT_OUT_SIZE
)
def forward(self, x, x_mask):
att = self.mlp(x)
att = att.masked_fill(
x_mask.squeeze(1).squeeze(1).unsqueeze(2),
-1e9
)
att = F.softmax(att, dim=1)
att_list = []
for i in range(self.__C.FLAT_GLIMPSES):
att_list.append(
torch.sum(att[:, :, i: i + 1] * x, dim=1)
)
x_atted = torch.cat(att_list, dim=1)
x_atted = self.linear_merge(x_atted)
return x_atted
class AttFlatMem(AttFlat):
def __init__(self, __C):
super(AttFlatMem, self).__init__(__C)
self.__C = __C
def forward(self, x_mem, x, x_mask):
att = self.mlp(x_mem)
att = att.masked_fill(
x_mask.squeeze(1).squeeze(1).unsqueeze(2),
float('-inf')
)
att = F.softmax(att, dim=1)
att_list = []
for i in range(self.__C.FLAT_GLIMPSES):
att_list.append(
torch.sum(att[:, :, i: i + 1] * x, dim=1)
)
x_atted = torch.cat(att_list, dim=1)
x_atted = self.linear_merge(x_atted)
return x_atted
# -------------------------
# ---- Main MCAN Model ----
# -------------------------
class Net1(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net1, self).__init__()
print('Training with Network type 1: VLCN')
self.pretrained_path = __C.PRETRAINED_PATH
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = VLC(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_frame = AttFlat(__C)
self.attflat_clip = AttFlat(__C)
self.dnc = DNC(
__C.FLAT_OUT_SIZE,
__C.FLAT_OUT_SIZE,
rnn_type='lstm',
num_layers=2,
num_hidden_layers=2,
bias=True,
batch_first=True,
dropout=0,
bidirectional=True,
nr_cells=__C.CELL_COUNT_DNC,
read_heads=__C.N_READ_HEADS_DNC,
cell_size=__C.WORD_LENGTH_DNC,
nonlinearity='tanh',
gpu_id=0,
independent_linears=False,
share_memory=False,
debug=False,
clip=20,
)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj_norm_dnc = LayerNorm(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC)
self.linear_dnc = FC(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC, __C.FLAT_OUT_SIZE, dropout_r=0.2)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# Backbone Framework
lang_feat, frame_feat, clip_feat = self.backbone(
lang_feat,
frame_feat,
clip_feat,
lang_feat_mask,
frame_feat_mask,
clip_feat_mask
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
frame_feat = self.attflat_frame(
frame_feat,
frame_feat_mask
)
clip_feat = self.attflat_clip(
clip_feat,
clip_feat_mask
)
proj_feat_0 = lang_feat + frame_feat + clip_feat
proj_feat_0 = self.proj_norm(proj_feat_0)
proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1)
proj_feat_1, (_, _, rv), _ = self.dnc(proj_feat_1, (None, None, None), reset_experience=True, pass_through_memory=True)
proj_feat_1 = proj_feat_1.sum(1)
proj_feat_1 = torch.cat([proj_feat_1, rv], dim=-1)
proj_feat_1 = self.proj_norm_dnc(proj_feat_1)
proj_feat_1 = self.linear_dnc(proj_feat_1)
# proj_feat_1 = self.proj_norm(proj_feat_1)
proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1))
return proj_feat
def load_pretrained_weights(self):
pretrained_msvd = torch.load(self.pretrained_path)['state_dict']
for n_pretrained, p_pretrained in pretrained_msvd.items():
if 'dnc' in n_pretrained:
self.state_dict()[n_pretrained].copy_(p_pretrained)
print('Pre-trained dnc-weights successfully loaded!')
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
class Net2(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net2, self).__init__()
print('Training with Network type 2: VLCN-FLF')
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = VLC(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_frame = AttFlat(__C)
self.attflat_clip = AttFlat(__C)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# Backbone Framework
lang_feat, frame_feat, clip_feat = self.backbone(
lang_feat,
frame_feat,
clip_feat,
lang_feat_mask,
frame_feat_mask,
clip_feat_mask
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
frame_feat = self.attflat_frame(
frame_feat,
frame_feat_mask
)
clip_feat = self.attflat_clip(
clip_feat,
clip_feat_mask
)
proj_feat = lang_feat + frame_feat + clip_feat
proj_feat = self.proj_norm(proj_feat)
proj_feat = torch.sigmoid(self.proj(proj_feat))
return proj_feat
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
class Net3(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net3, self).__init__()
print('Training with Network type 3: VLCN+LSTM')
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = VLC(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_frame = AttFlat(__C)
self.attflat_clip = AttFlat(__C)
self.lstm_fusion = nn.LSTM(
input_size=__C.FLAT_OUT_SIZE,
hidden_size=__C.FLAT_OUT_SIZE,
num_layers=2,
batch_first=True,
bidirectional=True
)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj_feat_1 = nn.Linear(__C.FLAT_OUT_SIZE * 2, __C.FLAT_OUT_SIZE)
self.proj_norm_lstm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# Backbone Framework
lang_feat, frame_feat, clip_feat = self.backbone(
lang_feat,
frame_feat,
clip_feat,
lang_feat_mask,
frame_feat_mask,
clip_feat_mask
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
frame_feat = self.attflat_frame(
frame_feat,
frame_feat_mask
)
clip_feat = self.attflat_clip(
clip_feat,
clip_feat_mask
)
proj_feat_0 = lang_feat + frame_feat + clip_feat
proj_feat_0 = self.proj_norm(proj_feat_0)
proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1)
proj_feat_1, _ = self.lstm_fusion(proj_feat_1)
proj_feat_1 = proj_feat_1.sum(1)
proj_feat_1 = self.proj_feat_1(proj_feat_1)
proj_feat_1 = self.proj_norm_lstm(proj_feat_1)
proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1))
return proj_feat
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
class Net4(nn.Module):
def __init__(self, __C, pretrained_emb, token_size, answer_size):
super(Net4, self).__init__()
print('Training with Network type 4: MCAN')
self.embedding = nn.Embedding(
num_embeddings=token_size,
embedding_dim=__C.WORD_EMBED_SIZE
)
# Loading the GloVe embedding weights
if __C.USE_GLOVE:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
self.lstm = nn.LSTM(
input_size=__C.WORD_EMBED_SIZE,
hidden_size=__C.HIDDEN_SIZE,
num_layers=1,
batch_first=True
)
self.frame_feat_linear = nn.Linear(
__C.FRAME_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.clip_feat_linear = nn.Linear(
__C.CLIP_FEAT_SIZE,
__C.HIDDEN_SIZE
)
self.backbone = MCA_ED(__C)
self.attflat_lang = AttFlat(__C)
self.attflat_vid = AttFlat(__C)
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
def forward(self, frame_feat, clip_feat, ques_ix):
# Make mask
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
frame_feat_mask = self.make_mask(frame_feat)
clip_feat_mask = self.make_mask(clip_feat)
# Pre-process Language Feature
lang_feat = self.embedding(ques_ix)
lang_feat, _ = self.lstm(lang_feat)
# Pre-process Video Feature
frame_feat = self.frame_feat_linear(frame_feat)
clip_feat = self.clip_feat_linear(clip_feat)
# concat frame and clip features
vid_feat = torch.cat([frame_feat, clip_feat], dim=1)
vid_feat_mask = torch.cat([frame_feat_mask, clip_feat_mask], dim=-1)
# Backbone Framework
lang_feat, vid_feat = self.backbone(
lang_feat,
vid_feat,
lang_feat_mask,
vid_feat_mask,
)
lang_feat = self.attflat_lang(
lang_feat,
lang_feat_mask
)
vid_feat = self.attflat_vid(
vid_feat,
vid_feat_mask
)
proj_feat = lang_feat + vid_feat
proj_feat = self.proj_norm(proj_feat)
proj_feat = torch.sigmoid(self.proj(proj_feat))
return proj_feat
# Masking
def make_mask(self, feature):
return (torch.sum(
torch.abs(feature),
dim=-1
) == 0).unsqueeze(1).unsqueeze(2)
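All four networks share the same make_mask helper: a position is treated as padding when its feature vector is exactly zero, and the helper returns True there so masked_fill can suppress those positions in the attention scores. A self-contained illustration of that convention (a sketch, independent of any config):

import torch

feature = torch.tensor([[[1.0, 2.0], [0.0, 0.0]]])  # second position is zero padding
mask = (torch.sum(torch.abs(feature), dim=-1) == 0).unsqueeze(1).unsqueeze(2)
print(mask.shape)  # torch.Size([1, 1, 1, 2])
print(mask)        # tensor([[[[False, True]]]])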

62
core/model/net_utils.py Normal file

@@ -0,0 +1,62 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
import torch.nn as nn
import os
import torch
class FC(nn.Module):
def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
super(FC, self).__init__()
self.dropout_r = dropout_r
self.use_relu = use_relu
self.linear = nn.Linear(in_size, out_size)
if use_relu:
self.relu = nn.ReLU(inplace=True)
if dropout_r > 0:
self.dropout = nn.Dropout(dropout_r)
def forward(self, x):
x = self.linear(x)
if self.use_relu:
x = self.relu(x)
if self.dropout_r > 0:
x = self.dropout(x)
return x
class MLP(nn.Module):
def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
super(MLP, self).__init__()
self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
self.linear = nn.Linear(mid_size, out_size)
def forward(self, x):
return self.linear(self.fc(x))
class LayerNorm(nn.Module):
def __init__(self, size, eps=1e-6):
super(LayerNorm, self).__init__()
self.eps = eps
self.a_2 = nn.Parameter(torch.ones(size))
self.b_2 = nn.Parameter(torch.zeros(size))
def forward(self, x):
mean = x.mean(-1, keepdim=True)
std = x.std(-1, keepdim=True)
return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
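Note that this LayerNorm normalizes over the last dimension with learned scale a_2 and shift b_2, but adds eps to the standard deviation rather than to the variance (and torch.std applies Bessel's correction), so its output differs slightly from torch.nn.LayerNorm. A quick numerical check (a sketch):

import torch
from core.model.net_utils import LayerNorm

ln = LayerNorm(8)
x = torch.randn(4, 8)
y = ln(x)
print(y.mean(-1).abs().max() < 1e-5)  # per-row mean is ~0 at initialization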

98
core/model/optim.py Normal file

@@ -0,0 +1,98 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------
import torch
import torch.optim as Optim
class WarmupOptimizer(object):
def __init__(self, lr_base, optimizer, data_size, batch_size):
self.optimizer = optimizer
self._step = 0
self.lr_base = lr_base
self._rate = 0
self.data_size = data_size
self.batch_size = batch_size
def step(self):
self._step += 1
rate = self.rate()
for p in self.optimizer.param_groups:
p['lr'] = rate
self._rate = rate
self.optimizer.step()
def zero_grad(self):
self.optimizer.zero_grad()
def rate(self, step=None):
if step is None:
step = self._step
if step <= int(self.data_size / self.batch_size * 1):
r = self.lr_base * 1/4.
elif step <= int(self.data_size / self.batch_size * 2):
r = self.lr_base * 2/4.
elif step <= int(self.data_size / self.batch_size * 3):
r = self.lr_base * 3/4.
else:
r = self.lr_base
return r
def get_optim(__C, model, data_size, optimizer, lr_base=None):
if lr_base is None:
lr_base = __C.LR_BASE
# modules = model._modules
# params_list = []
# for m in modules:
# if 'dnc' in m:
# params_list.append({
# 'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
# 'lr': __C.LR_DNC_BASE,
# 'flag': True
# })
# else:
# params_list.append({
# 'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
# })
if optimizer == 'adam':
optim = Optim.Adam(
filter(lambda p: p.requires_grad, model.parameters()),
lr=0,
betas=__C.OPT_BETAS,
eps=__C.OPT_EPS,
)
elif optimizer == 'rmsprop':
optim = Optim.RMSprop(
filter(lambda p: p.requires_grad, model.parameters()),
lr=0,
eps=__C.OPT_EPS,
weight_decay=__C.OPT_WEIGHT_DECAY
)
else:
raise ValueError('{} optimizer is not supported'.format(optimizer))
return WarmupOptimizer(
lr_base,
optim,
data_size,
__C.BATCH_SIZE
)
def adjust_lr(optim, decay_r):
optim.lr_base *= decay_r
def adjust_lr_dnc(optim, decay_r):
optim.lr_dnc_base *= decay_r
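The schedule above warms the base learning rate up over the first three epochs: with D = data_size and B = batch_size, steps up to D/B run at lr_base/4, up to 2D/B at lr_base/2, up to 3D/B at 3*lr_base/4, and at lr_base thereafter. A sketch of wiring it up by hand (the dummy model and hyper-parameters are assumptions; get_optim above does the same using values from __C):

import torch
import torch.optim as Optim
from core.model.optim import WarmupOptimizer

model = torch.nn.Linear(10, 2)
inner = Optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
sched = WarmupOptimizer(lr_base=1e-4, optimizer=inner, data_size=1000, batch_size=64)
print([sched.rate(step=s) for s in (1, 16, 32, 64)])
# [2.5e-05, 5e-05, 7.5e-05, 0.0001]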

163
core/model/utils.py Normal file

@@ -0,0 +1,163 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""
import torch.nn as nn
import torch as T
import torch.nn.functional as F
import numpy as np
import torch
from torch.autograd import Variable
import re
import string
def recursiveTrace(obj):
print(type(obj))
if hasattr(obj, 'grad_fn'):
print(obj.grad_fn)
recursiveTrace(obj.grad_fn)
elif hasattr(obj, 'saved_variables'):
print(obj.requires_grad, len(obj.saved_tensors), len(obj.saved_variables))
[print(v) for v in obj.saved_variables]
[recursiveTrace(v.grad_fn) for v in obj.saved_variables]
def cuda(x, grad=False, gpu_id=-1):
x = x.float() if T.is_tensor(x) else x
if gpu_id == -1:
t = T.FloatTensor(x)
t.requires_grad=grad
return t
else:
t = T.FloatTensor(x.pin_memory()).cuda(gpu_id)
t.requires_grad=grad
return t
def cudavec(x, grad=False, gpu_id=-1):
if gpu_id == -1:
t = T.Tensor(T.from_numpy(x))
t.requires_grad = grad
return t
else:
t = T.Tensor(T.from_numpy(x).pin_memory()).cuda(gpu_id)
t.requires_grad = grad
return t
def cudalong(x, grad=False, gpu_id=-1):
if gpu_id == -1:
t = T.LongTensor(T.from_numpy(x.astype(np.int64)))  # np.long is removed in recent NumPy
t.requires_grad = grad
return t
else:
t = T.LongTensor(T.from_numpy(x.astype(np.int64)).pin_memory()).cuda(gpu_id)
t.requires_grad = grad
return t
def θ(a, b, normBy=2):
"""Batchwise Cosine similarity
Cosine similarity
Arguments:
a {Tensor} -- A 3D Tensor (b * m * w)
b {Tensor} -- A 3D Tensor (b * r * w)
Returns:
Tensor -- Batchwise cosine similarity (b * r * m)
"""
dot = T.bmm(a, b.transpose(1,2))
a_norm = T.norm(a, normBy, dim=2).unsqueeze(2)
b_norm = T.norm(b, normBy, dim=2).unsqueeze(1)
cos = dot / (a_norm * b_norm + δ)
return cos.transpose(1,2).contiguous()
def σ(input, axis=1):
"""Softmax on an axis
Softmax on an axis
Arguments:
input {Tensor} -- input Tensor
Keyword Arguments:
axis {number} -- axis on which to take softmax on (default: {1})
Returns:
Tensor -- Softmax output Tensor
"""
input_size = input.size()
trans_input = input.transpose(axis, len(input_size) - 1)
trans_size = trans_input.size()
input_2d = trans_input.contiguous().view(-1, trans_size[-1])
soft_max_2d = F.softmax(input_2d, -1)
soft_max_nd = soft_max_2d.view(*trans_size)
return soft_max_nd.transpose(axis, len(input_size) - 1)
δ = 1e-6
def register_nan_checks(model):
def check_grad(module, grad_input, grad_output):
# print(module) you can add this to see that the hook is called
# print('hook called for ' + str(type(module)))
if any(np.all(np.isnan(gi.data.cpu().numpy())) for gi in grad_input if gi is not None):
print('NaN gradient in grad_input ' + type(module).__name__)
model.apply(lambda module: module.register_backward_hook(check_grad))
def apply_dict(dic):
for k, v in dic.items():
apply_var(v, k)
if isinstance(v, nn.Module):
key_list = [a for a in dir(v) if not a.startswith('__')]
for key in key_list:
apply_var(getattr(v, key), key)
for pk, pv in v._parameters.items():
apply_var(pv, pk)
def apply_var(v, k):
if isinstance(v, Variable) and v.requires_grad:
v.register_hook(check_nan_gradient(k))
def check_nan_gradient(name=''):
def f(tensor):
if np.isnan(T.mean(tensor).data.cpu().numpy()):
print('\nnan gradient of {} :'.format(name))
# print(tensor)
# assert 0, 'nan gradient'
return tensor
return f
def ptr(tensor):
if T.is_tensor(tensor):
return tensor.storage().data_ptr()
elif hasattr(tensor, 'data'):
return tensor.clone().data.storage().data_ptr()
else:
return tensor
# TODO: clean up this string-based type dispatch
def ensure_gpu(tensor, gpu_id):
if "cuda" in str(type(tensor)) and gpu_id != -1:
return tensor.cuda(gpu_id)
elif "cuda" in str(type(tensor)):
return tensor.cpu()
elif "Tensor" in str(type(tensor)) and gpu_id != -1:
return tensor.cuda(gpu_id)
elif "Tensor" in str(type(tensor)):
return tensor
elif type(tensor) is np.ndarray:
return cudavec(tensor, gpu_id=gpu_id).data
else:
return tensor
def print_gradient(x, name):
s = "Gradient of " + name + " ----------------------------------"
x.register_hook(lambda y: print(s, y.squeeze()))
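Finally, a small demonstration of the two Greek-named helpers above (a sketch; the module path follows this file's location): θ computes batchwise cosine similarity between r keys and m memory rows, and σ is a softmax along an arbitrary axis.

import torch as T
from core.model.utils import θ, σ

a = T.randn(2, 6, 10)  # (b * m * w) memory
b = T.randn(2, 3, 10)  # (b * r * w) read keys
print(θ(a, b).shape)   # torch.Size([2, 3, 6]) -- (b * r * m)
print(σ(T.randn(2, 3, 6), 2).sum(2))  # entries along axis 2 sum to 1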