Initial commit
commit b5f3b728c3
53 changed files with 7008 additions and 0 deletions

core/model/.gitkeep (new empty file)
core/model/C3D.py (new file, 80 lines)
@@ -0,0 +1,80 @@
"""
from https://github.com/DavideA/c3d-pytorch/blob/master/C3D_model.py
"""

import torch.nn as nn


class C3D(nn.Module):
    """
    The C3D network as described in [1].
    """

    def __init__(self):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 487)

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):

        h = self.relu(self.conv1(x))
        h = self.pool1(h)

        h = self.relu(self.conv2(h))
        h = self.pool2(h)

        h = self.relu(self.conv3a(h))
        h = self.relu(self.conv3b(h))
        h = self.pool3(h)

        h = self.relu(self.conv4a(h))
        h = self.relu(self.conv4b(h))
        h = self.pool4(h)

        h = self.relu(self.conv5a(h))
        h = self.relu(self.conv5b(h))
        h = self.pool5(h)

        h = h.view(-1, 8192)
        h = self.relu(self.fc6(h))
        h = self.dropout(h)
        h = self.relu(self.fc7(h))
        # h = self.dropout(h)

        # logits = self.fc8(h)
        # probs = self.softmax(logits)

        return h

"""
References
----------
[1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks."
Proceedings of the IEEE international conference on computer vision. 2015.
"""
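For reference, a minimal usage sketch (not part of the commit): fc6 expects 8192 features, which matches the standard C3D input of 16 RGB frames at 112x112; since fc8 and the softmax are commented out, forward returns fc7 features. Batch size and clip values below are illustrative assumptions.

import torch
from core.model.C3D import C3D

model = C3D()
clip = torch.randn(2, 3, 16, 112, 112)   # (batch, channels, frames, height, width) -- assumed shape
feat = model(clip)                       # fc7 features; fc8/softmax are disabled in this file
print(feat.shape)                        # torch.Size([2, 4096])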
core/model/dnc.py (new file, 323 lines)
@@ -0,0 +1,323 @@
"""
|
||||
PyTorch DNC implementation from
|
||||
-->
|
||||
https://github.com/ixaxaar/pytorch-dnc
|
||||
<--
|
||||
"""
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import torch.nn as nn
|
||||
import torch as T
|
||||
from torch.autograd import Variable as var
|
||||
import numpy as np
|
||||
|
||||
from torch.nn.utils.rnn import pad_packed_sequence as pad
|
||||
from torch.nn.utils.rnn import pack_padded_sequence as pack
|
||||
from torch.nn.utils.rnn import PackedSequence
|
||||
|
||||
from .util import *
|
||||
from .memory import *
|
||||
|
||||
from torch.nn.init import orthogonal_, xavier_uniform_
|
||||
|
||||
|
||||
class DNC(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size,
|
||||
hidden_size,
|
||||
rnn_type='lstm',
|
||||
num_layers=1,
|
||||
num_hidden_layers=2,
|
||||
bias=True,
|
||||
batch_first=True,
|
||||
dropout=0,
|
||||
bidirectional=False,
|
||||
nr_cells=5,
|
||||
read_heads=2,
|
||||
cell_size=10,
|
||||
nonlinearity='tanh',
|
||||
gpu_id=-1,
|
||||
independent_linears=False,
|
||||
share_memory=True,
|
||||
debug=False,
|
||||
clip=20
|
||||
):
|
||||
super(DNC, self).__init__()
|
||||
# todo: separate weights and RNNs for the interface and output vectors
|
||||
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.rnn_type = rnn_type
|
||||
self.num_layers = num_layers
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.bias = bias
|
||||
self.batch_first = batch_first
|
||||
self.dropout = dropout
|
||||
self.bidirectional = bidirectional
|
||||
self.nr_cells = nr_cells
|
||||
self.read_heads = read_heads
|
||||
self.cell_size = cell_size
|
||||
self.nonlinearity = nonlinearity
|
||||
self.gpu_id = gpu_id
|
||||
self.independent_linears = independent_linears
|
||||
self.share_memory = share_memory
|
||||
self.debug = debug
|
||||
self.clip = clip
|
||||
|
||||
self.w = self.cell_size
|
||||
self.r = self.read_heads
|
||||
|
||||
self.read_vectors_size = self.r * self.w
|
||||
self.output_size = self.hidden_size
|
||||
|
||||
self.nn_input_size = self.input_size + self.read_vectors_size
|
||||
self.nn_output_size = self.output_size + self.read_vectors_size
|
||||
|
||||
self.rnns = []
|
||||
self.memories = []
|
||||
|
||||
for layer in range(self.num_layers):
|
||||
if self.rnn_type.lower() == 'rnn':
|
||||
self.rnns.append(nn.RNN((self.nn_input_size if layer == 0 else self.nn_output_size), self.output_size,
|
||||
bias=self.bias, nonlinearity=self.nonlinearity, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
|
||||
elif self.rnn_type.lower() == 'gru':
|
||||
self.rnns.append(nn.GRU((self.nn_input_size if layer == 0 else self.nn_output_size),
|
||||
self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
|
||||
if self.rnn_type.lower() == 'lstm':
|
||||
self.rnns.append(nn.LSTM((self.nn_input_size if layer == 0 else self.nn_output_size),
|
||||
self.output_size, bias=self.bias, batch_first=True, dropout=self.dropout, num_layers=self.num_hidden_layers))
|
||||
setattr(self, self.rnn_type.lower() + '_layer_' + str(layer), self.rnns[layer])
|
||||
|
||||
# memories for each layer
|
||||
if not self.share_memory:
|
||||
self.memories.append(
|
||||
Memory(
|
||||
input_size=self.output_size,
|
||||
mem_size=self.nr_cells,
|
||||
cell_size=self.w,
|
||||
read_heads=self.r,
|
||||
gpu_id=self.gpu_id,
|
||||
independent_linears=self.independent_linears
|
||||
)
|
||||
)
|
||||
setattr(self, 'rnn_layer_memory_' + str(layer), self.memories[layer])
|
||||
|
||||
# only one memory shared by all layers
|
||||
if self.share_memory:
|
||||
self.memories.append(
|
||||
Memory(
|
||||
input_size=self.output_size,
|
||||
mem_size=self.nr_cells,
|
||||
cell_size=self.w,
|
||||
read_heads=self.r,
|
||||
gpu_id=self.gpu_id,
|
||||
independent_linears=self.independent_linears
|
||||
)
|
||||
)
|
||||
setattr(self, 'rnn_layer_memory_shared', self.memories[0])
|
||||
|
||||
# final output layer
|
||||
self.output = nn.Linear(self.nn_output_size, self.output_size)
|
||||
orthogonal_(self.output.weight)
|
||||
|
||||
if self.gpu_id != -1:
|
||||
[x.cuda(self.gpu_id) for x in self.rnns]
|
||||
[x.cuda(self.gpu_id) for x in self.memories]
|
||||
self.output.cuda()
|
||||
|
||||
def _init_hidden(self, hx, batch_size, reset_experience):
|
||||
# create empty hidden states if not provided
|
||||
if hx is None:
|
||||
hx = (None, None, None)
|
||||
(chx, mhx, last_read) = hx
|
||||
|
||||
# initialize hidden state of the controller RNN
|
||||
if chx is None:
|
||||
h = cuda(T.zeros(self.num_hidden_layers, batch_size, self.output_size), gpu_id=self.gpu_id)
|
||||
xavier_uniform_(h)
|
||||
|
||||
chx = [ (h, h) if self.rnn_type.lower() == 'lstm' else h for x in range(self.num_layers)]
|
||||
|
||||
# Last read vectors
|
||||
if last_read is None:
|
||||
last_read = cuda(T.zeros(batch_size, self.w * self.r), gpu_id=self.gpu_id)
|
||||
|
||||
# memory states
|
||||
if mhx is None:
|
||||
if self.share_memory:
|
||||
mhx = self.memories[0].reset(batch_size, erase=reset_experience)
|
||||
else:
|
||||
mhx = [m.reset(batch_size, erase=reset_experience) for m in self.memories]
|
||||
else:
|
||||
if self.share_memory:
|
||||
mhx = self.memories[0].reset(batch_size, mhx, erase=reset_experience)
|
||||
else:
|
||||
mhx = [m.reset(batch_size, h, erase=reset_experience) for m, h in zip(self.memories, mhx)]
|
||||
|
||||
return chx, mhx, last_read
|
||||
|
||||
def _debug(self, mhx, debug_obj):
|
||||
if not debug_obj:
|
||||
debug_obj = {
|
||||
'memory': [],
|
||||
'link_matrix': [],
|
||||
'precedence': [],
|
||||
'read_weights': [],
|
||||
'write_weights': [],
|
||||
'usage_vector': [],
|
||||
}
|
||||
|
||||
debug_obj['memory'].append(mhx['memory'][0].data.cpu().numpy())
|
||||
debug_obj['link_matrix'].append(mhx['link_matrix'][0][0].data.cpu().numpy())
|
||||
debug_obj['precedence'].append(mhx['precedence'][0].data.cpu().numpy())
|
||||
debug_obj['read_weights'].append(mhx['read_weights'][0].data.cpu().numpy())
|
||||
debug_obj['write_weights'].append(mhx['write_weights'][0].data.cpu().numpy())
|
||||
debug_obj['usage_vector'].append(mhx['usage_vector'][0].unsqueeze(0).data.cpu().numpy())
|
||||
return debug_obj
|
||||
|
||||
def _layer_forward(self, input, layer, hx=(None, None), pass_through_memory=True):
|
||||
(chx, mhx) = hx
|
||||
|
||||
# pass through the controller layer
|
||||
input, chx = self.rnns[layer](input.unsqueeze(1), chx)
|
||||
input = input.squeeze(1)
|
||||
|
||||
# clip the controller output
|
||||
if self.clip != 0:
|
||||
output = T.clamp(input, -self.clip, self.clip)
|
||||
else:
|
||||
output = input
|
||||
|
||||
# the interface vector
|
||||
ξ = output
|
||||
|
||||
# pass through memory
|
||||
if pass_through_memory:
|
||||
if self.share_memory:
|
||||
read_vecs, mhx = self.memories[0](ξ, mhx)
|
||||
else:
|
||||
read_vecs, mhx = self.memories[layer](ξ, mhx)
|
||||
# the read vectors
|
||||
read_vectors = read_vecs.view(-1, self.w * self.r)
|
||||
else:
|
||||
read_vectors = None
|
||||
|
||||
return output, (chx, mhx, read_vectors)
|
||||
|
||||
def forward(self, input, hx=(None, None, None), reset_experience=False, pass_through_memory=True):
|
||||
# handle packed data
|
||||
is_packed = type(input) is PackedSequence
|
||||
if is_packed:
|
||||
input, lengths = pad(input)
|
||||
max_length = lengths[0]
|
||||
else:
|
||||
max_length = input.size(1) if self.batch_first else input.size(0)
|
||||
lengths = [input.size(1)] * max_length if self.batch_first else [input.size(0)] * max_length
|
||||
|
||||
batch_size = input.size(0) if self.batch_first else input.size(1)
|
||||
|
||||
if not self.batch_first:
|
||||
input = input.transpose(0, 1)
|
||||
# make the data time-first
|
||||
|
||||
controller_hidden, mem_hidden, last_read = self._init_hidden(hx, batch_size, reset_experience)
|
||||
|
||||
# concat input with last read (or padding) vectors
|
||||
inputs = [T.cat([input[:, x, :], last_read], 1) for x in range(max_length)]
|
||||
|
||||
# batched forward pass per element / word / etc
|
||||
if self.debug:
|
||||
viz = None
|
||||
|
||||
outs = [None] * max_length
|
||||
read_vectors = None
|
||||
rv = [None] * max_length
|
||||
# pass through time
|
||||
for time in range(max_length):
|
||||
# pass through layers
|
||||
for layer in range(self.num_layers):
|
||||
# this layer's hidden states
|
||||
chx = controller_hidden[layer]
|
||||
m = mem_hidden if self.share_memory else mem_hidden[layer]
|
||||
# pass through controller
|
||||
outs[time], (chx, m, read_vectors) = \
|
||||
self._layer_forward(inputs[time], layer, (chx, m), pass_through_memory)
|
||||
|
||||
# debug memory
|
||||
if self.debug:
|
||||
viz = self._debug(m, viz)
|
||||
|
||||
# store the memory back (per layer or shared)
|
||||
if self.share_memory:
|
||||
mem_hidden = m
|
||||
else:
|
||||
mem_hidden[layer] = m
|
||||
controller_hidden[layer] = chx
|
||||
|
||||
if read_vectors is not None:
|
||||
# the controller output + read vectors go into next layer
|
||||
outs[time] = T.cat([outs[time], read_vectors], 1)
|
||||
if layer == self.num_layers - 1:
|
||||
rv[time] = read_vectors.reshape(batch_size, self.r, self.w)
|
||||
else:
|
||||
outs[time] = T.cat([outs[time], last_read], 1)
|
||||
inputs[time] = outs[time]
|
||||
|
||||
if self.debug:
|
||||
viz = {k: np.array(v) for k, v in viz.items()}
|
||||
viz = {k: v.reshape(v.shape[0], v.shape[1] * v.shape[2]) for k, v in viz.items()}
|
||||
|
||||
# pass through final output layer
|
||||
inputs = [self.output(i) for i in inputs]
|
||||
outputs = T.stack(inputs, 1 if self.batch_first else 0)
|
||||
|
||||
if is_packed:
|
||||
outputs = pack(outputs, lengths)
|
||||
|
||||
if self.debug:
|
||||
return outputs, (controller_hidden, mem_hidden, read_vectors), rv, viz
|
||||
else:
|
||||
return outputs, (controller_hidden, mem_hidden, read_vectors), rv
|
||||
|
||||
def __repr__(self):
|
||||
s = "\n----------------------------------------\n"
|
||||
s += '{name}({input_size}, {hidden_size}'
|
||||
if self.rnn_type != 'lstm':
|
||||
s += ', rnn_type={rnn_type}'
|
||||
if self.num_layers != 1:
|
||||
s += ', num_layers={num_layers}'
|
||||
if self.num_hidden_layers != 2:
|
||||
s += ', num_hidden_layers={num_hidden_layers}'
|
||||
if self.bias != True:
|
||||
s += ', bias={bias}'
|
||||
if self.batch_first != True:
|
||||
s += ', batch_first={batch_first}'
|
||||
if self.dropout != 0:
|
||||
s += ', dropout={dropout}'
|
||||
if self.bidirectional != False:
|
||||
s += ', bidirectional={bidirectional}'
|
||||
if self.nr_cells != 5:
|
||||
s += ', nr_cells={nr_cells}'
|
||||
if self.read_heads != 2:
|
||||
s += ', read_heads={read_heads}'
|
||||
if self.cell_size != 10:
|
||||
s += ', cell_size={cell_size}'
|
||||
if self.nonlinearity != 'tanh':
|
||||
s += ', nonlinearity={nonlinearity}'
|
||||
if self.gpu_id != -1:
|
||||
s += ', gpu_id={gpu_id}'
|
||||
if self.independent_linears != False:
|
||||
s += ', independent_linears={independent_linears}'
|
||||
if self.share_memory != True:
|
||||
s += ', share_memory={share_memory}'
|
||||
if self.debug != False:
|
||||
s += ', debug={debug}'
|
||||
if self.clip != 20:
|
||||
s += ', clip={clip}'
|
||||
|
||||
s += ")\n" + super(DNC, self).__repr__() + \
|
||||
"\n----------------------------------------\n"
|
||||
return s.format(name=self.__class__.__name__, **self.__dict__)
|
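A rough usage sketch of the DNC controller above (not part of the commit; it assumes core/model/memory.py and the util helpers it imports resolve, and the sizes are illustrative):

import torch
from core.model.dnc import DNC

# small CPU-only configuration; all sizes here are assumptions, not the repo's config values
dnc = DNC(input_size=64, hidden_size=128, num_layers=1, nr_cells=16,
          read_heads=2, cell_size=10, gpu_id=-1)

x = torch.randn(4, 7, 64)                                  # (batch, time, input_size), batch_first=True
out, (chx, mhx, read_vecs), rv = dnc(x, (None, None, None), reset_experience=True)
print(out.shape)                                           # torch.Size([4, 7, 128])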
core/model/mca.py (new file, 208 lines)
@@ -0,0 +1,208 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------

from core.model.net_utils import FC, MLP, LayerNorm
from core.model.dnc_improved import DNC, SharedMemDNC
from core.model.dnc_improved import FeedforwardController
import torch.nn as nn
import torch.nn.functional as F
import torch, math
import time


# ------------------------------
# ---- Multi-Head Attention ----
# ------------------------------

class MHAtt(nn.Module):
    def __init__(self, __C):
        super(MHAtt, self).__init__()
        self.__C = __C

        self.linear_v = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
        self.linear_k = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
        self.linear_q = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)
        self.linear_merge = nn.Linear(__C.HIDDEN_SIZE, __C.HIDDEN_SIZE)

        self.dropout = nn.Dropout(__C.DROPOUT_R)

    def forward(self, v, k, q, mask):
        n_batches = q.size(0)

        v = self.linear_v(v).view(
            n_batches,
            -1,
            self.__C.MULTI_HEAD,
            self.__C.HIDDEN_SIZE_HEAD
        ).transpose(1, 2)

        k = self.linear_k(k).view(
            n_batches,
            -1,
            self.__C.MULTI_HEAD,
            self.__C.HIDDEN_SIZE_HEAD
        ).transpose(1, 2)

        q = self.linear_q(q).view(
            n_batches,
            -1,
            self.__C.MULTI_HEAD,
            self.__C.HIDDEN_SIZE_HEAD
        ).transpose(1, 2)

        atted = self.att(v, k, q, mask)
        atted = atted.transpose(1, 2).contiguous().view(
            n_batches,
            -1,
            self.__C.HIDDEN_SIZE
        )

        atted = self.linear_merge(atted)

        return atted

    def att(self, value, key, query, mask):
        d_k = query.size(-1)

        scores = torch.matmul(
            query, key.transpose(-2, -1)
        ) / math.sqrt(d_k)

        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)

        att_map = F.softmax(scores, dim=-1)
        att_map = self.dropout(att_map)

        return torch.matmul(att_map, value)


# ---------------------------
# ---- Feed Forward Nets ----
# ---------------------------

class FFN(nn.Module):
    def __init__(self, __C):
        super(FFN, self).__init__()

        self.mlp = MLP(
            in_size=__C.HIDDEN_SIZE,
            mid_size=__C.FF_SIZE,
            out_size=__C.HIDDEN_SIZE,
            dropout_r=__C.DROPOUT_R,
            use_relu=True
        )

    def forward(self, x):
        return self.mlp(x)


# ------------------------
# ---- Self Attention ----
# ------------------------

class SA(nn.Module):
    def __init__(self, __C):
        super(SA, self).__init__()
        self.mhatt = MHAtt(__C)
        self.ffn = FFN(__C)

        self.dropout1 = nn.Dropout(__C.DROPOUT_R)
        self.norm1 = LayerNorm(__C.HIDDEN_SIZE)

        self.dropout2 = nn.Dropout(__C.DROPOUT_R)
        self.norm2 = LayerNorm(__C.HIDDEN_SIZE)

    def forward(self, x, x_mask):
        x = self.norm1(x + self.dropout1(
            self.mhatt(x, x, x, x_mask)
        ))

        x = self.norm2(x + self.dropout2(
            self.ffn(x)
        ))

        return x


# -------------------------------
# ---- Self Guided Attention ----
# -------------------------------

class SGA(nn.Module):
    def __init__(self, __C):
        super(SGA, self).__init__()

        self.mhatt1 = MHAtt(__C)
        self.mhatt2 = MHAtt(__C)
        self.ffn = FFN(__C)

        self.dropout1 = nn.Dropout(__C.DROPOUT_R)
        self.norm1 = LayerNorm(__C.HIDDEN_SIZE)

        self.dropout2 = nn.Dropout(__C.DROPOUT_R)
        self.norm2 = LayerNorm(__C.HIDDEN_SIZE)

        self.dropout3 = nn.Dropout(__C.DROPOUT_R)
        self.norm3 = LayerNorm(__C.HIDDEN_SIZE)

    def forward(self, x, y, x_mask, y_mask):
        x = self.norm1(x + self.dropout1(
            self.mhatt1(x, x, x, x_mask)
        ))

        x = self.norm2(x + self.dropout2(
            self.mhatt2(y, y, x, y_mask)
        ))

        x = self.norm3(x + self.dropout3(
            self.ffn(x)
        ))

        return x


# ------------------------------------------------
# ---- MCA Layers Cascaded by Encoder-Decoder ----
# ------------------------------------------------

class MCA_ED(nn.Module):
    def __init__(self, __C):
        super(MCA_ED, self).__init__()

        self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)])
        self.dec_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])

    def forward(self, x, y, x_mask, y_mask):
        # Get hidden vector
        for enc in self.enc_list:
            x = enc(x, x_mask)

        for dec in self.dec_list:
            y = dec(y, x, y_mask, x_mask)

        return x, y


class VLC(nn.Module):
    def __init__(self, __C):
        super(VLC, self).__init__()

        self.enc_list = nn.ModuleList([SA(__C) for _ in range(__C.LAYER)])
        self.dec_lang_frames_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])
        self.dec_lang_clips_list = nn.ModuleList([SGA(__C) for _ in range(__C.LAYER)])

    def forward(self, x, y, z, x_mask, y_mask, z_mask):
        # Get hidden vector
        for enc in self.enc_list:
            x = enc(x, x_mask)

        for dec in self.dec_lang_frames_list:
            y = dec(y, x, y_mask, x_mask)

        for dec in self.dec_lang_clips_list:
            z = dec(z, x, z_mask, x_mask)

        return x, y, z
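MHAtt.att above is standard scaled dot-product attention, softmax(QK^T / sqrt(d_k))V, applied per head. A shape walk-through under an assumed config (not part of the commit; it also assumes the other imported modules such as core.model.dnc_improved are present):

import torch
from types import SimpleNamespace
from core.model.mca import MHAtt

# hypothetical config stand-in; the real values live in the project's config object __C
__C = SimpleNamespace(HIDDEN_SIZE=512, MULTI_HEAD=8, HIDDEN_SIZE_HEAD=64, DROPOUT_R=0.1)

mhatt = MHAtt(__C)
x = torch.randn(2, 20, 512)                        # (batch, seq_len, HIDDEN_SIZE)
mask = torch.zeros(2, 1, 1, 20, dtype=torch.bool)  # True marks padded positions
y = mhatt(x, x, x, mask)                           # self-attention: v = k = q = x
print(y.shape)                                     # torch.Size([2, 20, 512])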
core/model/memory.py (new file, 314 lines)
@@ -0,0 +1,314 @@
"""
|
||||
PyTorch DNC implementation from
|
||||
-->
|
||||
https://github.com/ixaxaar/pytorch-dnc
|
||||
<--
|
||||
"""
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import torch.nn as nn
|
||||
import torch as T
|
||||
from torch.autograd import Variable as var
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
from core.model.util import *
|
||||
|
||||
|
||||
class Memory(nn.Module):
|
||||
|
||||
def __init__(self, input_size, mem_size=512, cell_size=32, read_heads=4, gpu_id=-1, independent_linears=True):
|
||||
super(Memory, self).__init__()
|
||||
|
||||
self.input_size = input_size
|
||||
self.mem_size = mem_size
|
||||
self.cell_size = cell_size
|
||||
self.read_heads = read_heads
|
||||
self.gpu_id = gpu_id
|
||||
self.independent_linears = independent_linears
|
||||
|
||||
m = self.mem_size
|
||||
w = self.cell_size
|
||||
r = self.read_heads
|
||||
|
||||
if self.independent_linears:
|
||||
self.read_keys_transform = nn.Linear(self.input_size, w * r)
|
||||
self.read_strengths_transform = nn.Linear(self.input_size, r)
|
||||
self.write_key_transform = nn.Linear(self.input_size, w)
|
||||
self.write_strength_transform = nn.Linear(self.input_size, 1)
|
||||
self.erase_vector_transform = nn.Linear(self.input_size, w)
|
||||
self.write_vector_transform = nn.Linear(self.input_size, w)
|
||||
self.free_gates_transform = nn.Linear(self.input_size, r)
|
||||
self.allocation_gate_transform = nn.Linear(self.input_size, 1)
|
||||
self.write_gate_transform = nn.Linear(self.input_size, 1)
|
||||
self.read_modes_transform = nn.Linear(self.input_size, 3 * r)
|
||||
else:
|
||||
self.interface_size = (w * r) + (3 * w) + (5 * r) + 3
|
||||
self.interface_weights = nn.Linear(
|
||||
self.input_size, self.interface_size)
|
||||
|
||||
self.I = cuda(1 - T.eye(m).unsqueeze(0),
|
||||
gpu_id=self.gpu_id) # (1 * n * n)
|
||||
|
||||
def reset(self, batch_size=1, hidden=None, erase=True):
|
||||
m = self.mem_size
|
||||
w = self.cell_size
|
||||
r = self.read_heads
|
||||
b = batch_size
|
||||
|
||||
if hidden is None:
|
||||
return {
|
||||
'memory': cuda(T.zeros(b, m, w).fill_(0), gpu_id=self.gpu_id),
|
||||
'link_matrix': cuda(T.zeros(b, 1, m, m), gpu_id=self.gpu_id),
|
||||
'precedence': cuda(T.zeros(b, 1, m), gpu_id=self.gpu_id),
|
||||
'read_weights': cuda(T.zeros(b, r, m).fill_(0), gpu_id=self.gpu_id),
|
||||
'write_weights': cuda(T.zeros(b, 1, m).fill_(0), gpu_id=self.gpu_id),
|
||||
'usage_vector': cuda(T.zeros(b, m), gpu_id=self.gpu_id),
|
||||
# 'free_gates': cuda(T.zeros(b, r), gpu_id=self.gpu_id),
|
||||
# 'alloc_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id),
|
||||
# 'write_gates': cuda(T.zeros(b, 1), gpu_id=self.gpu_id),
|
||||
# 'read_modes': cuda(T.zeros(b, r, 3), gpu_id=self.gpu_id)
|
||||
}
|
||||
else:
|
||||
hidden['memory'] = hidden['memory'].clone()
|
||||
hidden['link_matrix'] = hidden['link_matrix'].clone()
|
||||
hidden['precedence'] = hidden['precedence'].clone()
|
||||
hidden['read_weights'] = hidden['read_weights'].clone()
|
||||
hidden['write_weights'] = hidden['write_weights'].clone()
|
||||
hidden['usage_vector'] = hidden['usage_vector'].clone()
|
||||
# hidden['free_gates'] = hidden['free_gates'].clone()
|
||||
# hidden['alloc_gates'] = hidden['alloc_gates'].clone()
|
||||
# hidden['write_gates'] = hidden['write_gates'].clone()
|
||||
# hidden['read_modes'] = hidden['read_modes'].clone()
|
||||
|
||||
if erase:
|
||||
hidden['memory'].data.fill_(0)
|
||||
hidden['link_matrix'].data.zero_()
|
||||
hidden['precedence'].data.zero_()
|
||||
hidden['read_weights'].data.fill_(0)
|
||||
hidden['write_weights'].data.fill_(0)
|
||||
hidden['usage_vector'].data.zero_()
|
||||
# hidden['free_gates'].data.fill_()
|
||||
# hidden['alloc_gates'].data.fill_()
|
||||
# hidden['write_gates'].data.fill_()
|
||||
# hidden['read_modes'].data.fill_()
|
||||
|
||||
return hidden
|
||||
|
||||
def get_usage_vector(self, usage, free_gates, read_weights, write_weights):
|
||||
# write_weights = write_weights.detach() # detach from the computation graph
|
||||
# if read_weights.size(0) > free_gates.size(0):
|
||||
# read_weights = read_weights[:free_gates.size(0), :, :]
|
||||
# if usage.size(0) > free_gates.size(0):
|
||||
# usage = usage[:free_gates.size(0), :]
|
||||
# if write_weights.size(0) > free_gates.size(0):
|
||||
# write_weights = write_weights[:free_gates.size(0), :, :]
|
||||
usage = usage + (1 - usage) * (1 - T.prod(1 - write_weights, 1))
|
||||
ψ = T.prod(1 - free_gates.unsqueeze(2) * read_weights, 1)
|
||||
return usage * ψ
|
||||
|
||||
def allocate(self, usage, write_gate):
|
||||
# ensure values are not too small prior to cumprod.
|
||||
usage = δ + (1 - δ) * usage
|
||||
batch_size = usage.size(0)
|
||||
# free list
|
||||
sorted_usage, φ = T.topk(usage, self.mem_size, dim=1, largest=False)
|
||||
|
||||
# cumprod with exclusive=True
|
||||
# https://discuss.pytorch.org/t/cumprod-exclusive-true-equivalences/2614/8
|
||||
v = var(sorted_usage.data.new(batch_size, 1).fill_(1))
|
||||
cat_sorted_usage = T.cat((v, sorted_usage), 1)
|
||||
prod_sorted_usage = T.cumprod(cat_sorted_usage, 1)[:, :-1]
|
||||
|
||||
sorted_allocation_weights = (1 - sorted_usage) * prod_sorted_usage.squeeze()
|
||||
|
||||
# construct the reverse sorting index https://stackoverflow.com/questions/2483696/undo-or-reverse-argsort-python
|
||||
_, φ_rev = T.topk(φ, k=self.mem_size, dim=1, largest=False)
|
||||
allocation_weights = sorted_allocation_weights.gather(1, φ_rev.long())
|
||||
|
||||
return allocation_weights.unsqueeze(1), usage
|
||||
|
||||
def write_weighting(self, memory, write_content_weights, allocation_weights, write_gate, allocation_gate):
|
||||
ag = allocation_gate.unsqueeze(-1)
|
||||
wg = write_gate.unsqueeze(-1)
|
||||
|
||||
return wg * (ag * allocation_weights + (1 - ag) * write_content_weights)
|
||||
|
||||
def get_link_matrix(self, link_matrix, write_weights, precedence):
|
||||
precedence = precedence.unsqueeze(2)
|
||||
write_weights_i = write_weights.unsqueeze(3)
|
||||
write_weights_j = write_weights.unsqueeze(2)
|
||||
|
||||
prev_scale = 1 - write_weights_i - write_weights_j
|
||||
new_link_matrix = write_weights_i * precedence
|
||||
|
||||
link_matrix = prev_scale * link_matrix + new_link_matrix
|
||||
# trick to delete diag elems
|
||||
return self.I.expand_as(link_matrix) * link_matrix
|
||||
|
||||
def update_precedence(self, precedence, write_weights):
|
||||
return (1 - T.sum(write_weights, 2, keepdim=True)) * precedence + write_weights
|
||||
|
||||
def write(self, write_key, write_vector, erase_vector, free_gates, read_strengths, write_strength, write_gate, allocation_gate, hidden):
|
||||
# get current usage
|
||||
hidden['usage_vector'] = self.get_usage_vector(
|
||||
hidden['usage_vector'],
|
||||
free_gates,
|
||||
hidden['read_weights'],
|
||||
hidden['write_weights']
|
||||
)
|
||||
|
||||
# lookup memory with write_key and write_strength
|
||||
write_content_weights = self.content_weightings(
|
||||
hidden['memory'], write_key, write_strength)
|
||||
|
||||
# get memory allocation
|
||||
alloc, _ = self.allocate(
|
||||
hidden['usage_vector'],
|
||||
allocation_gate * write_gate
|
||||
)
|
||||
|
||||
# get write weightings
|
||||
hidden['write_weights'] = self.write_weighting(
|
||||
hidden['memory'],
|
||||
write_content_weights,
|
||||
alloc,
|
||||
write_gate,
|
||||
allocation_gate
|
||||
)
|
||||
|
||||
weighted_resets = hidden['write_weights'].unsqueeze(
|
||||
3) * erase_vector.unsqueeze(2)
|
||||
reset_gate = T.prod(1 - weighted_resets, 1)
|
||||
# Update memory
|
||||
hidden['memory'] = hidden['memory'] * reset_gate
|
||||
|
||||
hidden['memory'] = hidden['memory'] + \
|
||||
T.bmm(hidden['write_weights'].transpose(1, 2), write_vector)
|
||||
|
||||
# update link_matrix
|
||||
hidden['link_matrix'] = self.get_link_matrix(
|
||||
hidden['link_matrix'],
|
||||
hidden['write_weights'],
|
||||
hidden['precedence']
|
||||
)
|
||||
hidden['precedence'] = self.update_precedence(
|
||||
hidden['precedence'], hidden['write_weights'])
|
||||
|
||||
return hidden
|
||||
|
||||
def content_weightings(self, memory, keys, strengths):
|
||||
# if memory.size(0) > keys.size(0):
|
||||
# memory = memory[:keys.size(0), :, :]
|
||||
d = θ(memory, keys)
|
||||
return σ(d * strengths.unsqueeze(2), 2)
|
||||
|
||||
def directional_weightings(self, link_matrix, read_weights):
|
||||
rw = read_weights.unsqueeze(1)
|
||||
|
||||
f = T.matmul(link_matrix, rw.transpose(2, 3)).transpose(2, 3)
|
||||
b = T.matmul(rw, link_matrix)
|
||||
return f.transpose(1, 2), b.transpose(1, 2)
|
||||
|
||||
def read_weightings(self, memory, content_weights, link_matrix, read_modes, read_weights):
|
||||
forward_weight, backward_weight = self.directional_weightings(
|
||||
link_matrix, read_weights)
|
||||
|
||||
content_mode = read_modes[:, :, 2].contiguous(
|
||||
).unsqueeze(2) * content_weights
|
||||
backward_mode = T.sum(
|
||||
read_modes[:, :, 0:1].contiguous().unsqueeze(3) * backward_weight, 2)
|
||||
forward_mode = T.sum(
|
||||
read_modes[:, :, 1:2].contiguous().unsqueeze(3) * forward_weight, 2)
|
||||
|
||||
return backward_mode + content_mode + forward_mode
|
||||
|
||||
def read_vectors(self, memory, read_weights):
|
||||
return T.bmm(read_weights, memory)
|
||||
|
||||
def read(self, read_keys, read_strengths, read_modes, hidden):
|
||||
content_weights = self.content_weightings(
|
||||
hidden['memory'], read_keys, read_strengths)
|
||||
|
||||
hidden['read_weights'] = self.read_weightings(
|
||||
hidden['memory'],
|
||||
content_weights,
|
||||
hidden['link_matrix'],
|
||||
read_modes,
|
||||
hidden['read_weights']
|
||||
)
|
||||
read_vectors = self.read_vectors(
|
||||
hidden['memory'], hidden['read_weights'])
|
||||
return read_vectors, hidden
|
||||
|
||||
def forward(self, ξ, hidden):
|
||||
|
||||
# ξ = ξ.detach()
|
||||
m = self.mem_size
|
||||
w = self.cell_size
|
||||
r = self.read_heads
|
||||
b = ξ.size()[0]
|
||||
|
||||
if self.independent_linears:
|
||||
# r read keys (b * r * w)
|
||||
read_keys = self.read_keys_transform(ξ).view(b, r, w)
|
||||
# r read strengths (b * r)
|
||||
read_strengths = F.softplus(
|
||||
self.read_strengths_transform(ξ).view(b, r))
|
||||
# write key (b * 1 * w)
|
||||
write_key = self.write_key_transform(ξ).view(b, 1, w)
|
||||
# write strength (b * 1)
|
||||
write_strength = F.softplus(
|
||||
self.write_strength_transform(ξ).view(b, 1))
|
||||
# erase vector (b * 1 * w)
|
||||
erase_vector = T.sigmoid(
|
||||
self.erase_vector_transform(ξ).view(b, 1, w))
|
||||
# write vector (b * 1 * w)
|
||||
write_vector = self.write_vector_transform(ξ).view(b, 1, w)
|
||||
# r free gates (b * r)
|
||||
free_gates = T.sigmoid(self.free_gates_transform(ξ).view(b, r))
|
||||
# allocation gate (b * 1)
|
||||
allocation_gate = T.sigmoid(
|
||||
self.allocation_gate_transform(ξ).view(b, 1))
|
||||
# write gate (b * 1)
|
||||
write_gate = T.sigmoid(self.write_gate_transform(ξ).view(b, 1))
|
||||
# read modes (b * r * 3)
|
||||
read_modes = σ(self.read_modes_transform(ξ).view(b, r, 3), -1)
|
||||
else:
|
||||
ξ = self.interface_weights(ξ)
|
||||
# r read keys (b * w * r)
|
||||
read_keys = ξ[:, :r * w].contiguous().view(b, r, w)
|
||||
# r read strengths (b * r)
|
||||
read_strengths = F.softplus(
|
||||
ξ[:, r * w:r * w + r].contiguous().view(b, r))
|
||||
# write key (b * w * 1)
|
||||
write_key = ξ[:, r * w + r:r * w + r + w].contiguous().view(b, 1, w)
|
||||
# write strength (b * 1)
|
||||
write_strength = F.softplus(
|
||||
ξ[:, r * w + r + w].contiguous().view(b, 1))
|
||||
# erase vector (b * w)
|
||||
erase_vector = T.sigmoid(
|
||||
ξ[:, r * w + r + w + 1: r * w + r + 2 * w + 1].contiguous().view(b, 1, w))
|
||||
# write vector (b * w)
|
||||
write_vector = ξ[:, r * w + r + 2 * w + 1: r * w + r + 3 * w + 1].contiguous().view(b, 1, w)
|
||||
# r free gates (b * r)
|
||||
free_gates = T.sigmoid(
|
||||
ξ[:, r * w + r + 3 * w + 1: r * w + 2 * r + 3 * w + 1].contiguous().view(b, r))
|
||||
# allocation gate (b * 1)
|
||||
allocation_gate = T.sigmoid(
|
||||
ξ[:, r * w + 2 * r + 3 * w + 1].contiguous().unsqueeze(1).view(b, 1))
|
||||
# write gate (b * 1)
|
||||
write_gate = T.sigmoid(
|
||||
ξ[:, r * w + 2 * r + 3 * w + 2].contiguous()).unsqueeze(1).view(b, 1)
|
||||
# read modes (b * 3*r)
|
||||
read_modes = σ(ξ[:, r * w + 2 * r + 3 * w + 3: r *
|
||||
w + 5 * r + 3 * w + 3].contiguous().view(b, r, 3), -1)
|
||||
|
||||
hidden = self.write(write_key, write_vector, erase_vector, free_gates,
|
||||
read_strengths, write_strength, write_gate, allocation_gate, hidden)
|
||||
hidden["free_gates"] = free_gates.clone().detach()
|
||||
hidden["allocation_gate"] = allocation_gate.clone().detach()
|
||||
hidden["write_gate"] = write_gate.clone().detach()
|
||||
hidden["read_modes"] = read_modes.clone().detach()
|
||||
|
||||
return self.read(read_keys, read_strengths, read_modes, hidden)
|
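When independent_linears is False, Memory.forward slices one interface vector ξ of size (w·r) + 3w + 5r + 3 in the order shown above. A small sanity check of that layout, with r and w chosen only for illustration:

# interface layout: r read keys (r*w), r read strengths (r), write key (w), write strength (1),
# erase vector (w), write vector (w), r free gates (r), allocation gate (1), write gate (1), read modes (3*r)
r, w = 2, 10                                   # assumed read_heads and cell_size
interface_size = (w * r) + (3 * w) + (5 * r) + 3
assert interface_size == r*w + r + w + 1 + w + w + r + 1 + 1 + 3*r
print(interface_size)                          # 63 for r=2, w=10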
core/model/net.py (new file, 501 lines)
@@ -0,0 +1,501 @@
# --------------------------------------------------------
|
||||
# mcan-vqa (Deep Modular Co-Attention Networks)
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# Written by Yuhao Cui https://github.com/cuiyuhao1996
|
||||
# --------------------------------------------------------
|
||||
|
||||
from core.model.net_utils import FC, MLP, LayerNorm
|
||||
from core.model.mca import SA, MCA_ED, VLC
|
||||
from core.model.dnc import DNC
|
||||
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch
|
||||
|
||||
# ------------------------------
|
||||
# ---- Flatten the sequence ----
|
||||
# ------------------------------
|
||||
|
||||
class AttFlat(nn.Module):
|
||||
def __init__(self, __C):
|
||||
super(AttFlat, self).__init__()
|
||||
self.__C = __C
|
||||
|
||||
self.mlp = MLP(
|
||||
in_size=__C.HIDDEN_SIZE,
|
||||
mid_size=__C.FLAT_MLP_SIZE,
|
||||
out_size=__C.FLAT_GLIMPSES,
|
||||
dropout_r=__C.DROPOUT_R,
|
||||
use_relu=True
|
||||
)
|
||||
|
||||
self.linear_merge = nn.Linear(
|
||||
__C.HIDDEN_SIZE * __C.FLAT_GLIMPSES,
|
||||
__C.FLAT_OUT_SIZE
|
||||
)
|
||||
|
||||
def forward(self, x, x_mask):
|
||||
att = self.mlp(x)
|
||||
att = att.masked_fill(
|
||||
x_mask.squeeze(1).squeeze(1).unsqueeze(2),
|
||||
-1e9
|
||||
)
|
||||
att = F.softmax(att, dim=1)
|
||||
|
||||
att_list = []
|
||||
for i in range(self.__C.FLAT_GLIMPSES):
|
||||
att_list.append(
|
||||
torch.sum(att[:, :, i: i + 1] * x, dim=1)
|
||||
)
|
||||
|
||||
x_atted = torch.cat(att_list, dim=1)
|
||||
x_atted = self.linear_merge(x_atted)
|
||||
|
||||
return x_atted
|
||||
|
||||
class AttFlatMem(AttFlat):
|
||||
def __init__(self, __C):
|
||||
super(AttFlatMem, self).__init__(__C)
|
||||
self.__C = __C
|
||||
|
||||
def forward(self, x_mem, x, x_mask):
|
||||
att = self.mlp(x_mem)
|
||||
att = att.masked_fill(
|
||||
x_mask.squeeze(1).squeeze(1).unsqueeze(2),
|
||||
float('-inf')
|
||||
)
|
||||
att = F.softmax(att, dim=1)
|
||||
att_list = []
|
||||
for i in range(self.__C.FLAT_GLIMPSES):
|
||||
att_list.append(
|
||||
torch.sum(att[:, :, i: i + 1] * x, dim=1)
|
||||
)
|
||||
x_atted = torch.cat(att_list, dim=1)
|
||||
x_atted = self.linear_merge(x_atted)
|
||||
|
||||
return x_atted
|
||||
# -------------------------
|
||||
# ---- Main MCAN Model ----
|
||||
# -------------------------
|
||||
|
||||
class Net1(nn.Module):
|
||||
def __init__(self, __C, pretrained_emb, token_size, answer_size):
|
||||
super(Net1, self).__init__()
|
||||
print('Training with Network type 1: VLCN')
|
||||
self.pretrained_path = __C.PRETRAINED_PATH
|
||||
self.embedding = nn.Embedding(
|
||||
num_embeddings=token_size,
|
||||
embedding_dim=__C.WORD_EMBED_SIZE
|
||||
)
|
||||
|
||||
# Loading the GloVe embedding weights
|
||||
if __C.USE_GLOVE:
|
||||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
|
||||
|
||||
self.lstm = nn.LSTM(
|
||||
input_size=__C.WORD_EMBED_SIZE,
|
||||
hidden_size=__C.HIDDEN_SIZE,
|
||||
num_layers=1,
|
||||
batch_first=True
|
||||
)
|
||||
|
||||
self.frame_feat_linear = nn.Linear(
|
||||
__C.FRAME_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
|
||||
self.clip_feat_linear = nn.Linear(
|
||||
__C.CLIP_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
self.backbone = VLC(__C)
|
||||
|
||||
self.attflat_lang = AttFlat(__C)
|
||||
self.attflat_frame = AttFlat(__C)
|
||||
self.attflat_clip = AttFlat(__C)
|
||||
|
||||
self.dnc = DNC(
|
||||
__C.FLAT_OUT_SIZE,
|
||||
__C.FLAT_OUT_SIZE,
|
||||
rnn_type='lstm',
|
||||
num_layers=2,
|
||||
num_hidden_layers=2,
|
||||
bias=True,
|
||||
batch_first=True,
|
||||
dropout=0,
|
||||
bidirectional=True,
|
||||
nr_cells=__C.CELL_COUNT_DNC,
|
||||
read_heads=__C.N_READ_HEADS_DNC,
|
||||
cell_size=__C.WORD_LENGTH_DNC,
|
||||
nonlinearity='tanh',
|
||||
gpu_id=0,
|
||||
independent_linears=False,
|
||||
share_memory=False,
|
||||
debug=False,
|
||||
clip=20,
|
||||
)
|
||||
|
||||
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
|
||||
|
||||
self.proj_norm_dnc = LayerNorm(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC)
|
||||
self.linear_dnc = FC(__C.FLAT_OUT_SIZE + __C.N_READ_HEADS_DNC * __C.WORD_LENGTH_DNC, __C.FLAT_OUT_SIZE, dropout_r=0.2)
|
||||
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
|
||||
|
||||
def forward(self, frame_feat, clip_feat, ques_ix):
|
||||
|
||||
# Make mask
|
||||
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
|
||||
frame_feat_mask = self.make_mask(frame_feat)
|
||||
clip_feat_mask = self.make_mask(clip_feat)
|
||||
|
||||
# Pre-process Language Feature
|
||||
lang_feat = self.embedding(ques_ix)
|
||||
lang_feat, _ = self.lstm(lang_feat)
|
||||
|
||||
|
||||
# Pre-process Video Feature
|
||||
frame_feat = self.frame_feat_linear(frame_feat)
|
||||
clip_feat = self.clip_feat_linear(clip_feat)
|
||||
|
||||
# Backbone Framework
|
||||
lang_feat, frame_feat, clip_feat = self.backbone(
|
||||
lang_feat,
|
||||
frame_feat,
|
||||
clip_feat,
|
||||
lang_feat_mask,
|
||||
frame_feat_mask,
|
||||
clip_feat_mask
|
||||
)
|
||||
|
||||
lang_feat = self.attflat_lang(
|
||||
lang_feat,
|
||||
lang_feat_mask
|
||||
)
|
||||
|
||||
frame_feat = self.attflat_frame(
|
||||
frame_feat,
|
||||
frame_feat_mask
|
||||
)
|
||||
|
||||
clip_feat = self.attflat_clip(
|
||||
clip_feat,
|
||||
clip_feat_mask
|
||||
)
|
||||
proj_feat_0 = lang_feat + frame_feat + clip_feat
|
||||
proj_feat_0 = self.proj_norm(proj_feat_0)
|
||||
|
||||
proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1)
|
||||
proj_feat_1, (_, _, rv), _ = self.dnc(proj_feat_1, (None, None, None), reset_experience=True, pass_through_memory=True)
|
||||
proj_feat_1 = proj_feat_1.sum(1)
|
||||
proj_feat_1 = torch.cat([proj_feat_1, rv], dim=-1)
|
||||
proj_feat_1 = self.proj_norm_dnc(proj_feat_1)
|
||||
proj_feat_1 = self.linear_dnc(proj_feat_1)
|
||||
# proj_feat_1 = self.proj_norm(proj_feat_1)
|
||||
|
||||
proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1))
|
||||
|
||||
return proj_feat
|
||||
|
||||
def load_pretrained_weights(self):
|
||||
pretrained_msvd = torch.load(self.pretrained_path)['state_dict']
|
||||
for n_pretrained, p_pretrained in pretrained_msvd.items():
|
||||
if 'dnc' in n_pretrained:
|
||||
self.state_dict()[n_pretrained].copy_(p_pretrained)
|
||||
print('Pre-trained dnc-weights successfully loaded!')
|
||||
|
||||
# Masking
|
||||
def make_mask(self, feature):
|
||||
return (torch.sum(
|
||||
torch.abs(feature),
|
||||
dim=-1
|
||||
) == 0).unsqueeze(1).unsqueeze(2)
|
||||
|
||||
class Net2(nn.Module):
|
||||
def __init__(self, __C, pretrained_emb, token_size, answer_size):
|
||||
super(Net2, self).__init__()
|
||||
print('Training with Network type 2: VLCN-FLF')
|
||||
self.embedding = nn.Embedding(
|
||||
num_embeddings=token_size,
|
||||
embedding_dim=__C.WORD_EMBED_SIZE
|
||||
)
|
||||
# Loading the GloVe embedding weights
|
||||
if __C.USE_GLOVE:
|
||||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
|
||||
|
||||
self.lstm = nn.LSTM(
|
||||
input_size=__C.WORD_EMBED_SIZE,
|
||||
hidden_size=__C.HIDDEN_SIZE,
|
||||
num_layers=1,
|
||||
batch_first=True
|
||||
)
|
||||
|
||||
self.frame_feat_linear = nn.Linear(
|
||||
__C.FRAME_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
|
||||
self.clip_feat_linear = nn.Linear(
|
||||
__C.CLIP_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
self.backbone = VLC(__C)
|
||||
|
||||
self.attflat_lang = AttFlat(__C)
|
||||
self.attflat_frame = AttFlat(__C)
|
||||
self.attflat_clip = AttFlat(__C)
|
||||
|
||||
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
|
||||
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
|
||||
|
||||
|
||||
def forward(self, frame_feat, clip_feat, ques_ix):
|
||||
|
||||
# Make mask
|
||||
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
|
||||
frame_feat_mask = self.make_mask(frame_feat)
|
||||
clip_feat_mask = self.make_mask(clip_feat)
|
||||
|
||||
# Pre-process Language Feature
|
||||
lang_feat = self.embedding(ques_ix)
|
||||
lang_feat, _ = self.lstm(lang_feat)
|
||||
|
||||
|
||||
# Pre-process Video Feature
|
||||
frame_feat = self.frame_feat_linear(frame_feat)
|
||||
clip_feat = self.clip_feat_linear(clip_feat)
|
||||
|
||||
# Backbone Framework
|
||||
lang_feat, frame_feat, clip_feat = self.backbone(
|
||||
lang_feat,
|
||||
frame_feat,
|
||||
clip_feat,
|
||||
lang_feat_mask,
|
||||
frame_feat_mask,
|
||||
clip_feat_mask
|
||||
)
|
||||
|
||||
lang_feat = self.attflat_lang(
|
||||
lang_feat,
|
||||
lang_feat_mask
|
||||
)
|
||||
|
||||
frame_feat = self.attflat_frame(
|
||||
frame_feat,
|
||||
frame_feat_mask
|
||||
)
|
||||
|
||||
clip_feat = self.attflat_clip(
|
||||
clip_feat,
|
||||
clip_feat_mask
|
||||
)
|
||||
proj_feat = lang_feat + frame_feat + clip_feat
|
||||
proj_feat = self.proj_norm(proj_feat)
|
||||
proj_feat = torch.sigmoid(self.proj(proj_feat))
|
||||
|
||||
return proj_feat
|
||||
# Masking
|
||||
def make_mask(self, feature):
|
||||
return (torch.sum(
|
||||
torch.abs(feature),
|
||||
dim=-1
|
||||
) == 0).unsqueeze(1).unsqueeze(2)
|
||||
|
||||
class Net3(nn.Module):
|
||||
def __init__(self, __C, pretrained_emb, token_size, answer_size):
|
||||
super(Net3, self).__init__()
|
||||
print('Training with Network type 3: VLCN+LSTM')
|
||||
|
||||
self.embedding = nn.Embedding(
|
||||
num_embeddings=token_size,
|
||||
embedding_dim=__C.WORD_EMBED_SIZE
|
||||
)
|
||||
|
||||
# Loading the GloVe embedding weights
|
||||
if __C.USE_GLOVE:
|
||||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
|
||||
|
||||
self.lstm = nn.LSTM(
|
||||
input_size=__C.WORD_EMBED_SIZE,
|
||||
hidden_size=__C.HIDDEN_SIZE,
|
||||
num_layers=1,
|
||||
batch_first=True
|
||||
)
|
||||
|
||||
self.frame_feat_linear = nn.Linear(
|
||||
__C.FRAME_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
|
||||
self.clip_feat_linear = nn.Linear(
|
||||
__C.CLIP_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
self.backbone = VLC(__C)
|
||||
|
||||
self.attflat_lang = AttFlat(__C)
|
||||
self.attflat_frame = AttFlat(__C)
|
||||
self.attflat_clip = AttFlat(__C)
|
||||
|
||||
self.lstm_fusion = nn.LSTM(
|
||||
input_size=__C.FLAT_OUT_SIZE,
|
||||
hidden_size=__C.FLAT_OUT_SIZE,
|
||||
num_layers=2,
|
||||
batch_first=True,
|
||||
bidirectional=True
|
||||
)
|
||||
|
||||
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
|
||||
self.proj_feat_1 = nn.Linear(__C.FLAT_OUT_SIZE * 2, __C.FLAT_OUT_SIZE)
|
||||
|
||||
self.proj_norm_lstm = LayerNorm(__C.FLAT_OUT_SIZE)
|
||||
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
|
||||
|
||||
def forward(self, frame_feat, clip_feat, ques_ix):
|
||||
|
||||
# Make mask
|
||||
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
|
||||
frame_feat_mask = self.make_mask(frame_feat)
|
||||
clip_feat_mask = self.make_mask(clip_feat)
|
||||
|
||||
# Pre-process Language Feature
|
||||
lang_feat = self.embedding(ques_ix)
|
||||
lang_feat, _ = self.lstm(lang_feat)
|
||||
|
||||
|
||||
# Pre-process Video Feature
|
||||
frame_feat = self.frame_feat_linear(frame_feat)
|
||||
clip_feat = self.clip_feat_linear(clip_feat)
|
||||
|
||||
# Backbone Framework
|
||||
lang_feat, frame_feat, clip_feat = self.backbone(
|
||||
lang_feat,
|
||||
frame_feat,
|
||||
clip_feat,
|
||||
lang_feat_mask,
|
||||
frame_feat_mask,
|
||||
clip_feat_mask
|
||||
)
|
||||
|
||||
lang_feat = self.attflat_lang(
|
||||
lang_feat,
|
||||
lang_feat_mask
|
||||
)
|
||||
|
||||
frame_feat = self.attflat_frame(
|
||||
frame_feat,
|
||||
frame_feat_mask
|
||||
)
|
||||
|
||||
clip_feat = self.attflat_clip(
|
||||
clip_feat,
|
||||
clip_feat_mask
|
||||
)
|
||||
proj_feat_0 = lang_feat + frame_feat + clip_feat
|
||||
proj_feat_0 = self.proj_norm(proj_feat_0)
|
||||
|
||||
proj_feat_1 = torch.stack([lang_feat, frame_feat, clip_feat], dim=1)
|
||||
proj_feat_1, _ = self.lstm_fusion(proj_feat_1)
|
||||
proj_feat_1 = proj_feat_1.sum(1)
|
||||
proj_feat_1 = self.proj_feat_1(proj_feat_1)
|
||||
proj_feat_1 = self.proj_norm_lstm(proj_feat_1)
|
||||
|
||||
proj_feat = torch.sigmoid(self.proj(proj_feat_0 + proj_feat_1))
|
||||
|
||||
return proj_feat
|
||||
|
||||
# Masking
|
||||
def make_mask(self, feature):
|
||||
return (torch.sum(
|
||||
torch.abs(feature),
|
||||
dim=-1
|
||||
) == 0).unsqueeze(1).unsqueeze(2)
|
||||
|
||||
class Net4(nn.Module):
|
||||
def __init__(self, __C, pretrained_emb, token_size, answer_size):
|
||||
super(Net4, self).__init__()
|
||||
print('Training with Network type 4: MCAN')
|
||||
self.embedding = nn.Embedding(
|
||||
num_embeddings=token_size,
|
||||
embedding_dim=__C.WORD_EMBED_SIZE
|
||||
)
|
||||
|
||||
# Loading the GloVe embedding weights
|
||||
if __C.USE_GLOVE:
|
||||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
|
||||
|
||||
self.lstm = nn.LSTM(
|
||||
input_size=__C.WORD_EMBED_SIZE,
|
||||
hidden_size=__C.HIDDEN_SIZE,
|
||||
num_layers=1,
|
||||
batch_first=True
|
||||
)
|
||||
|
||||
self.frame_feat_linear = nn.Linear(
|
||||
__C.FRAME_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
|
||||
self.clip_feat_linear = nn.Linear(
|
||||
__C.CLIP_FEAT_SIZE,
|
||||
__C.HIDDEN_SIZE
|
||||
)
|
||||
self.backbone = MCA_ED(__C)
|
||||
|
||||
self.attflat_lang = AttFlat(__C)
|
||||
self.attflat_vid = AttFlat(__C)
|
||||
|
||||
self.proj_norm = LayerNorm(__C.FLAT_OUT_SIZE)
|
||||
self.proj = nn.Linear(__C.FLAT_OUT_SIZE, answer_size)
|
||||
|
||||
|
||||
def forward(self, frame_feat, clip_feat, ques_ix):
|
||||
|
||||
# Make mask
|
||||
lang_feat_mask = self.make_mask(ques_ix.unsqueeze(2))
|
||||
frame_feat_mask = self.make_mask(frame_feat)
|
||||
clip_feat_mask = self.make_mask(clip_feat)
|
||||
|
||||
# Pre-process Language Feature
|
||||
lang_feat = self.embedding(ques_ix)
|
||||
lang_feat, _ = self.lstm(lang_feat)
|
||||
|
||||
|
||||
# Pre-process Video Feature
|
||||
frame_feat = self.frame_feat_linear(frame_feat)
|
||||
clip_feat = self.clip_feat_linear(clip_feat)
|
||||
|
||||
# concat frame and clip features
|
||||
vid_feat = torch.cat([frame_feat, clip_feat], dim=1)
|
||||
vid_feat_mask = torch.cat([frame_feat_mask, clip_feat_mask], dim=-1)
|
||||
# Backbone Framework
|
||||
lang_feat, vid_feat = self.backbone(
|
||||
lang_feat,
|
||||
vid_feat,
|
||||
lang_feat_mask,
|
||||
vid_feat_mask,
|
||||
)
|
||||
|
||||
lang_feat = self.attflat_lang(
|
||||
lang_feat,
|
||||
lang_feat_mask
|
||||
)
|
||||
|
||||
vid_feat = self.attflat_vid(
|
||||
vid_feat,
|
||||
vid_feat_mask
|
||||
)
|
||||
|
||||
proj_feat = lang_feat + vid_feat
|
||||
proj_feat = self.proj_norm(proj_feat)
|
||||
proj_feat = torch.sigmoid(self.proj(proj_feat))
|
||||
|
||||
return proj_feat
|
||||
|
||||
# Masking
|
||||
def make_mask(self, feature):
|
||||
return (torch.sum(
|
||||
torch.abs(feature),
|
||||
dim=-1
|
||||
) == 0).unsqueeze(1).unsqueeze(2)
|
||||
|
||||
|
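All four networks share the same make_mask helper: a position is masked out when its feature vector is all zeros, i.e. padding. A self-contained sketch of that behaviour (the tensor sizes are arbitrary):

import torch

def make_mask(feature):
    # True where the feature vector is all-zero padding; output shape (B, 1, 1, L)
    return (torch.sum(torch.abs(feature), dim=-1) == 0).unsqueeze(1).unsqueeze(2)

feat = torch.zeros(1, 4, 8)        # one sample, 4 positions, 8-dim features
feat[0, :2] = torch.randn(2, 8)    # only the first two positions carry content
print(make_mask(feat))             # tensor([[[[False, False,  True,  True]]]])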
core/model/net_utils.py (new file, 62 lines)
@@ -0,0 +1,62 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------

import torch.nn as nn
import os
import torch


class FC(nn.Module):
    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
        super(FC, self).__init__()
        self.dropout_r = dropout_r
        self.use_relu = use_relu

        self.linear = nn.Linear(in_size, out_size)

        if use_relu:
            self.relu = nn.ReLU(inplace=True)

        if dropout_r > 0:
            self.dropout = nn.Dropout(dropout_r)

    def forward(self, x):
        x = self.linear(x)

        if self.use_relu:
            x = self.relu(x)

        if self.dropout_r > 0:
            x = self.dropout(x)

        return x


class MLP(nn.Module):
    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
        super(MLP, self).__init__()

        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
        self.linear = nn.Linear(mid_size, out_size)

    def forward(self, x):
        return self.linear(self.fc(x))


class LayerNorm(nn.Module):
    def __init__(self, size, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.eps = eps

        self.a_2 = nn.Parameter(torch.ones(size))
        self.b_2 = nn.Parameter(torch.zeros(size))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)

        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
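A quick shape check of the helpers above (sizes are arbitrary, chosen only for the sketch):

import torch
from core.model.net_utils import MLP, LayerNorm

mlp = MLP(in_size=512, mid_size=1024, out_size=512, dropout_r=0.1, use_relu=True)
norm = LayerNorm(512)
x = torch.randn(2, 20, 512)
print(norm(mlp(x)).shape)          # torch.Size([2, 20, 512])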
core/model/optim.py (new file, 98 lines)
@@ -0,0 +1,98 @@
# --------------------------------------------------------
# mcan-vqa (Deep Modular Co-Attention Networks)
# Licensed under The MIT License [see LICENSE for details]
# Written by Yuhao Cui https://github.com/cuiyuhao1996
# --------------------------------------------------------

import torch
import torch.optim as Optim


class WarmupOptimizer(object):
    def __init__(self, lr_base, optimizer, data_size, batch_size):
        self.optimizer = optimizer
        self._step = 0
        self.lr_base = lr_base
        self._rate = 0
        self.data_size = data_size
        self.batch_size = batch_size

    def step(self):
        self._step += 1

        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate

        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def rate(self, step=None):
        if step is None:
            step = self._step

        if step <= int(self.data_size / self.batch_size * 1):
            r = self.lr_base * 1/4.
        elif step <= int(self.data_size / self.batch_size * 2):
            r = self.lr_base * 2/4.
        elif step <= int(self.data_size / self.batch_size * 3):
            r = self.lr_base * 3/4.
        else:
            r = self.lr_base

        return r


def get_optim(__C, model, data_size, optimizer, lr_base=None):
    if lr_base is None:
        lr_base = __C.LR_BASE

    # modules = model._modules
    # params_list = []
    # for m in modules:
    #     if 'dnc' in m:
    #         params_list.append({
    #             'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
    #             'lr': __C.LR_DNC_BASE,
    #             'flag': True
    #         })
    #     else:
    #         params_list.append({
    #             'params': filter(lambda p: p.requires_grad, modules[m].parameters()),
    #         })
    if optimizer == 'adam':
        optim = Optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=0,
            betas=__C.OPT_BETAS,
            eps=__C.OPT_EPS,
        )
    elif optimizer == 'rmsprop':
        optim = Optim.RMSprop(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=0,
            eps=__C.OPT_EPS,
            weight_decay=__C.OPT_WEIGHT_DECAY
        )
    else:
        raise ValueError('{} optimizer is not supported'.format(optimizer))
    return WarmupOptimizer(
        lr_base,
        optim,
        data_size,
        __C.BATCH_SIZE
    )


def adjust_lr(optim, decay_r):
    optim.lr_base *= decay_r


def adjust_lr_dnc(optim, decay_r):
    optim.lr_dnc_base *= decay_r
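WarmupOptimizer.rate() ramps the learning rate over the first three epochs (1/4, 2/4, 3/4 of lr_base, measured in steps of data_size / batch_size) and then holds at lr_base. A usage sketch with a hypothetical config stand-in (the real values come from the project's config):

import torch
import torch.nn as nn
from types import SimpleNamespace
from core.model.optim import get_optim, adjust_lr

# hypothetical config values, not taken from the repo
__C = SimpleNamespace(LR_BASE=1e-4, OPT_BETAS=(0.9, 0.98), OPT_EPS=1e-9,
                      OPT_WEIGHT_DECAY=0, BATCH_SIZE=64)

model = nn.Linear(10, 2)
optim = get_optim(__C, model, data_size=6400, optimizer='adam')

loss = model(torch.randn(8, 10)).sum()
optim.zero_grad()
loss.backward()
optim.step()                       # first-epoch steps run at LR_BASE / 4
adjust_lr(optim, 0.5)              # later, decay the base LR in place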
core/model/utils.py (new file, 163 lines)
@@ -0,0 +1,163 @@
"""
PyTorch DNC implementation from
-->
https://github.com/ixaxaar/pytorch-dnc
<--
"""

import torch.nn as nn
import torch as T
import torch.nn.functional as F
import numpy as np
import torch
from torch.autograd import Variable
import re
import string


def recursiveTrace(obj):
    print(type(obj))
    if hasattr(obj, 'grad_fn'):
        print(obj.grad_fn)
        recursiveTrace(obj.grad_fn)
    elif hasattr(obj, 'saved_variables'):
        print(obj.requires_grad, len(obj.saved_tensors), len(obj.saved_variables))
        [print(v) for v in obj.saved_variables]
        [recursiveTrace(v.grad_fn) for v in obj.saved_variables]


def cuda(x, grad=False, gpu_id=-1):
    x = x.float() if T.is_tensor(x) else x
    if gpu_id == -1:
        t = T.FloatTensor(x)
        t.requires_grad = grad
        return t
    else:
        t = T.FloatTensor(x.pin_memory()).cuda(gpu_id)
        t.requires_grad = grad
        return t


def cudavec(x, grad=False, gpu_id=-1):
    if gpu_id == -1:
        t = T.Tensor(T.from_numpy(x))
        t.requires_grad = grad
        return t
    else:
        t = T.Tensor(T.from_numpy(x).pin_memory()).cuda(gpu_id)
        t.requires_grad = grad
        return t


def cudalong(x, grad=False, gpu_id=-1):
    if gpu_id == -1:
        t = T.LongTensor(T.from_numpy(x.astype(np.long)))
        t.requires_grad = grad
        return t
    else:
        t = T.LongTensor(T.from_numpy(x.astype(np.long)).pin_memory()).cuda(gpu_id)
        t.requires_grad = grad
        return t


def θ(a, b, normBy=2):
    """Batchwise Cosine similarity

    Arguments:
        a {Tensor} -- A 3D Tensor (b * m * w)
        b {Tensor} -- A 3D Tensor (b * r * w)

    Returns:
        Tensor -- Batchwise cosine similarity (b * r * m)
    """
    dot = T.bmm(a, b.transpose(1, 2))
    a_norm = T.norm(a, normBy, dim=2).unsqueeze(2)
    b_norm = T.norm(b, normBy, dim=2).unsqueeze(1)
    cos = dot / (a_norm * b_norm + δ)
    return cos.transpose(1, 2).contiguous()


def σ(input, axis=1):
    """Softmax on an axis

    Arguments:
        input {Tensor} -- input Tensor

    Keyword Arguments:
        axis {number} -- axis on which to take softmax on (default: {1})

    Returns:
        Tensor -- Softmax output Tensor
    """
    input_size = input.size()

    trans_input = input.transpose(axis, len(input_size) - 1)
    trans_size = trans_input.size()

    input_2d = trans_input.contiguous().view(-1, trans_size[-1])
    soft_max_2d = F.softmax(input_2d, -1)
    soft_max_nd = soft_max_2d.view(*trans_size)
    return soft_max_nd.transpose(axis, len(input_size) - 1)


δ = 1e-6


def register_nan_checks(model):
    def check_grad(module, grad_input, grad_output):
        # print(module) you can add this to see that the hook is called
        # print('hook called for ' + str(type(module)))
        if any(np.all(np.isnan(gi.data.cpu().numpy())) for gi in grad_input if gi is not None):
            print('NaN gradient in grad_input ' + type(module).__name__)

    model.apply(lambda module: module.register_backward_hook(check_grad))


def apply_dict(dic):
    for k, v in dic.items():
        apply_var(v, k)
        if isinstance(v, nn.Module):
            key_list = [a for a in dir(v) if not a.startswith('__')]
            for key in key_list:
                apply_var(getattr(v, key), key)
            for pk, pv in v._parameters.items():
                apply_var(pv, pk)


def apply_var(v, k):
    if isinstance(v, Variable) and v.requires_grad:
        v.register_hook(check_nan_gradient(k))


def check_nan_gradient(name=''):
    def f(tensor):
        if np.isnan(T.mean(tensor).data.cpu().numpy()):
            print('\nnan gradient of {} :'.format(name))
            # print(tensor)
            # assert 0, 'nan gradient'
            return tensor
    return f


def ptr(tensor):
    if T.is_tensor(tensor):
        return tensor.storage().data_ptr()
    elif hasattr(tensor, 'data'):
        return tensor.clone().data.storage().data_ptr()
    else:
        return tensor


# TODO: EWW change this shit
def ensure_gpu(tensor, gpu_id):
    if "cuda" in str(type(tensor)) and gpu_id != -1:
        return tensor.cuda(gpu_id)
    elif "cuda" in str(type(tensor)):
        return tensor.cpu()
    elif "Tensor" in str(type(tensor)) and gpu_id != -1:
        return tensor.cuda(gpu_id)
    elif "Tensor" in str(type(tensor)):
        return tensor
    elif type(tensor) is np.ndarray:
        return cudavec(tensor, gpu_id=gpu_id).data
    else:
        return tensor


def print_gradient(x, name):
    s = "Gradient of " + name + " ----------------------------------"
    x.register_hook(lambda y: print(s, y.squeeze()))
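θ computes batchwise cosine similarity between a memory matrix and a set of keys, which is how the DNC memory scores each slot for content-based addressing, and σ turns those scores into addressing weights. A small shape check (sizes assumed; the import path follows the file name in this diff, core/model/utils.py, although the DNC files import it as core.model.util):

import torch as T
from core.model.utils import θ, σ

mem = T.randn(2, 16, 10)           # (batch, mem_slots, cell_size)
keys = T.randn(2, 4, 10)           # (batch, read_heads, cell_size)
sim = θ(mem, keys)                 # (batch, read_heads, mem_slots)
weights = σ(sim, 2)                # softmax over the memory slots
print(sim.shape, weights.shape)    # torch.Size([2, 4, 16]) torch.Size([2, 4, 16])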