Add NLP task models
This commit is contained in:
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions
0  joint_paraphrase_model/libs/__init__.py  Normal file
416  joint_paraphrase_model/libs/corpora.py  Normal file
@@ -0,0 +1,416 @@
import logging

import config


def tokenize(sent):
    return sent.split(" ")


class Lang:
    """Represents the vocabulary."""

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
            config.NOFIX: 2,
            config.SOS: 3,
            config.EOS: 4,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
            2: config.NOFIX,
            3: config.SOS,
            4: config.EOS,
        }
        self.n_words = 5

    def add_sentence(self, sentence):
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object.
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count.keys():
            new_lang.add_word(word)
        for word in other.word2count.keys():
            new_lang.add_word(word)

        # Fix the counts on the new one
        for word in new_lang.word2count.keys():
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang
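A quick, hypothetical sanity check of how `Lang.__add__` merges two vocabularies (not part of the commit; `config.*` are whatever token constants the repo's config module defines):

# Hypothetical usage sketch: merging two vocabularies.
src = Lang("src")
src.add_sentence(tokenize("the cat sat"))
dst = Lang("dst")
dst.add_sentence(tokenize("the dog sat"))
merged = src + dst
assert merged.word2count["the"] == 2  # counts are summed across both Langs
assert merged.word2count["cat"] == 1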
def load_wiki(split):
    """Load the Wiki portion of PAWS."""
    logger = logging.getLogger(f"{__name__}.load_wiki")
    lang = Lang("wiki")

    if split == "train":
        path = config.wiki_train_path
    elif split == "val":
        path = config.wiki_dev_path
    elif split == "test":
        path = config.wiki_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_qqp_paws(split):
    """Load the QQP portion of PAWS."""
    logger = logging.getLogger(f"{__name__}.load_qqp_paws")
    lang = Lang("qqp_paws")

    if split == "train":
        path = config.qqp_paws_train_path
    elif split == "val":
        path = config.qqp_paws_dev_path
    elif split == "test":
        path = config.qqp_paws_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_qqp(split):
    """Load the original QQP."""
    logger = logging.getLogger(f"{__name__}.load_qqp")
    lang = Lang("qqp")

    if split == "train":
        path = config.qqp_train_path
    elif split == "val":
        path = config.qqp_dev_path
    elif split == "test":
        path = config.qqp_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            rating, sent1, sent2, _ = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_qqp_kag(split):
    """Load QQP from Kaggle.

    Not the original split right now; experimenting with the Kaggle
    100K/3K/30K split.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp_kag")
    lang = Lang("qqp_kag")

    if split == "train":
        path = config.qqp_kag_train_path
    elif split == "val":
        path = config.qqp_kag_dev_path
    elif split == "test":
        path = config.qqp_kag_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        # the Kaggle version has three fields per line instead of four
        for line in handle:
            rating, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_msrpc(split):
    """Load the Microsoft Research Paraphrase Corpus (MSRPC)."""
    logger = logging.getLogger(f"{__name__}.load_msrpc")
    lang = Lang("msrpc")

    if split == "train":
        path = config.msrpc_train_path
    elif split == "val":
        path = config.msrpc_dev_path
    elif split == "test":
        path = config.msrpc_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            rating, _, _, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_sentiment(split):
    """Load the Kaggle sentiment competition dataset."""
    logger = logging.getLogger(f"{__name__}.load_sentiment")
    lang = Lang("sentiment")

    if split == "train":
        path = config.sentiment_train_path
    elif split == "val":
        path = config.sentiment_dev_path
    elif split == "test":
        path = config.sentiment_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_tamil(split):
    """Load the English-to-Tamil dataset (current SOTA is roughly 13 BLEU)."""
    logger = logging.getLogger(f"{__name__}.load_tamil")
    lang = Lang("tamil")

    if split == "train":
        path = config.tamil_train_path
    elif split == "val":
        path = config.tamil_dev_path
    elif split == "test":
        path = config.tamil_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            # TODO: plain whitespace splitting is probably not an adequate
            # tokenizer for Tamil
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_compression(split):
    """Load the Google sentence compression dataset."""
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.compression_train_path
    elif split == "val":
        path = config.compression_dev_path
    elif split == "test":
        path = config.compression_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford(split):
    """Load the phrase-level Stanford sentiment dataset."""
    logger = logging.getLogger(f"{__name__}.load_stanford")
    lang = Lang("stanford")

    if split == "train":
        path = config.stanford_train_path
    elif split == "val":
        path = config.stanford_dev_path
    elif split == "test":
        path = config.stanford_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # this file has no header line to skip

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford_sent(split):
    """Load the sentence-level Stanford sentiment dataset."""
    logger = logging.getLogger(f"{__name__}.load_stanford_sent")
    lang = Lang("stanford_sent")

    if split == "train":
        path = config.stanford_sent_train_path
    elif split == "val":
        path = config.stanford_sent_dev_path
    elif split == "test":
        path = config.stanford_sent_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # this file has no header line to skip

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang
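All loaders share the same `(pairs, lang)` contract, so swapping datasets is a one-line change. A hypothetical sketch (the `config.*` paths are whatever the repo's config defines):

# Hypothetical usage sketch: load a split and build a joint vocabulary.
train_pairs, train_lang = load_qqp_paws("train")
val_pairs, val_lang = load_qqp_paws("val")
lang = train_lang + val_lang  # merge vocabularies via Lang.__add__
print(len(train_pairs), lang.n_words)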
1  joint_paraphrase_model/libs/fixation_generation/__init__.py  Normal file
@@ -0,0 +1 @@
from .main import *
131  joint_paraphrase_model/libs/fixation_generation/main.py  Normal file
@@ -0,0 +1,131 @@
from collections import OrderedDict
import logging

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from .self_attention import Transformer


def random_embedding(vocab_size, embedding_dim):
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)

    # ignore_index=0 masks out the padding label
    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)

    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    score = torch.sigmoid(outputs)

    # zero out positions beyond each sequence's true length
    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1

    score = score * mask

    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )

    loss = loss / batch_size

    return loss, score.view(batch_size, seq_len)
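A small, hypothetical shape check for mse_loss (random values; in real use the labels are zero-padded past each sequence's true length):

# Hypothetical sanity check: batch of 2, max length 5, true lengths 5 and 3.
outputs = torch.randn(2, 5)  # raw per-token fixation logits
labels = torch.rand(2, 5)    # fixation targets in [0, 1]
labels[1, 3:] = 0            # zero-pad past the true length
loss, preds = mse_loss(outputs, labels, batch_size=2, seq_len=5,
                       word_seq_length=[5, 3])
print(loss.item(), preds.shape)  # scalar loss, torch.Size([2, 5])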
class Network(nn.Module):
    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300  # w2v/GloVe vectors are 300-dimensional
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if attention:
            # Experiment history: 1024-d with 16 heads / 16 layers for the
            # attention vs. no-attention runs; 4 heads / 4 layers at 128-d for
            # the initial attention and pretraining runs; 4 heads / 1 layer at
            # 128-d produced the results reported for IUI.
            # postlayers["position_encodings"] = PositionalEncoding(hidden_dim)
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)
        return self.post(x.transpose(1, 0))


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(x, word_seq_length, True, False)
        lstm_out, hidden = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
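A hypothetical end-to-end shape check for the fixation network (assumes a 10k vocabulary and random GloVe-style embeddings; not part of the commit):

# Hypothetical usage sketch: predict one fixation score per token.
emb = torch.tensor(random_embedding(10000, 300), dtype=torch.float32)
net = Network("glove", vocab_size=10000, embedding_dim=300,
              dropout=0.1, hidden_dim=128, embeddings=emb)
tokens = torch.randint(0, 10000, (2, 7))      # batch of 2, length 7
scores = net(tokens, word_seq_length=[7, 7])  # enforce_sorted=False, so lengths need not be sorted
print(scores.shape)                           # (2, 7, 1): one score per token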
131  joint_paraphrase_model/libs/fixation_generation/self_attention.py  Normal file
@@ -0,0 +1,131 @@
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        # Not a parameter
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy

        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()


class AttentionLayer(nn.Module):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def forward(self, Q, K, V):
        # Q: float32[batch_size, n_queries, d_k]
        # K: float32[batch_size, n_keys, d_k]
        # V: float32[batch_size, n_keys, d_v]
        dk = K.shape[-1]
        KT = torch.transpose(K, -1, -2)
        # scaled dot-product attention
        weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
        # weight_logits: float32[batch_size, n_queries, n_keys]
        weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
        return torch.bmm(weights, V)
        # returns float32[batch_size, n_queries, d_v]
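A quick, hypothetical check of the attention layer in isolation (random inputs):

# Hypothetical sanity check: attention weights mix the value vectors.
att = AttentionLayer()
Q = torch.randn(1, 2, 8)   # one batch, two queries, d_k = 8
K = torch.randn(1, 5, 8)   # five keys
V = torch.randn(1, 5, 16)  # five values, d_v = 16
out = att(Q, K, V)
print(out.shape)           # torch.Size([1, 2, 16])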
class MultiHeadedSelfAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        # one projection per head for queries, keys, and values
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q: float32[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K: float32[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V: float32[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # head_output: float32[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated: float32[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out: float32[batch_size, sequence_length, self.d_model]
        return out


class Feedforward(nn.Module):
    def __init__(self, d_model):
        super(Feedforward, self).__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, d_model]
        return self.W2(torch.relu(self.W1(x)))


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, n_layers):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList([
            MultiHeadedSelfAttentionLayer(d_model, n_heads)
            for _ in range(n_layers)
        ])
        self.ffs = nn.ModuleList([
            Feedforward(d_model)
            for _ in range(n_layers)
        ])

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        for attention_layer, ff in zip(self.attention_layers, self.ffs):
            attention_out = attention_layer(x)
            # attention_out: float32[batch_size, sequence_length, self.d_model]
            # residual connection followed by layer norm
            x = F.layer_norm(x + attention_out, x.shape[2:])
            ff_out = ff(x)
            # ff_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + ff_out, x.shape[2:])
        return x
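A hypothetical smoke test for the encoder stack, which is shape-preserving by construction:

# Hypothetical usage sketch: the stack maps [batch, seq, d_model] to itself.
enc = Transformer(d_model=128, n_heads=4, n_layers=1)
x = torch.randn(2, 7, 128)
y = enc(x)
print(y.shape)  # torch.Size([2, 7, 128]) -- same shape in, same shape out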
1  joint_paraphrase_model/libs/paraphrase_generation/__init__.py  Normal file
@@ -0,0 +1 @@
from .main import *
86  joint_paraphrase_model/libs/paraphrase_generation/main.py  Normal file
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embeddings):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        embeddings,
        dropout_p,
        max_length,
    ):
        super(AttnDecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding.from_pretrained(embeddings)  # for paraphrase generation
        # self.embedding = nn.Embedding(len(embeddings), 300)  # for NMT with Tamil; trying with sentiment too
        self.attn = nn.Linear(self.input_size + self.hidden_size, self.max_length)
        self.attn_combine = nn.Linear(
            self.input_size + self.hidden_size, self.hidden_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, fixations):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1
        )

        # reweight the attention distribution by the predicted fixations,
        # zero-padding the fixation vector up to max_length
        pad = nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)
        attn_weights = attn_weights * pad(fixations.squeeze().unsqueeze(0))

        attn_applied = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        # note: no log_softmax here; the caller's loss is expected to
        # handle normalization
        output = self.out(output[0])
        return output, hidden, attn_weights
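A hypothetical single decoding step wiring the two modules together with a fixation vector (dimensions chosen to fit the one-token-at-a-time, tutorial-style interface; not part of the commit):

# Hypothetical usage sketch: one decoder step with random fixations.
vocab_size, emb_dim, hidden, max_len = 1000, 300, 256, 10
emb = torch.randn(vocab_size, emb_dim)
encoder = EncoderRNN(emb_dim, hidden, emb)
decoder = AttnDecoderRNN(emb_dim, hidden, vocab_size, emb, 0.1, max_len)

# encode a 4-token input one token at a time
enc_outputs = torch.zeros(max_len, hidden)
hidden_state = encoder.initHidden()
for i, tok in enumerate(torch.tensor([5, 42, 7, 99])):
    out, hidden_state = encoder(tok, hidden_state)
    enc_outputs[i] = out[0, 0]

fixations = torch.rand(4, 1)  # one fixation score per source token
logits, hidden_state, attn = decoder(
    torch.tensor([3]), hidden_state, enc_outputs, fixations
)
print(logits.shape)  # torch.Size([1, 1000])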
225  joint_paraphrase_model/libs/utils.py  Normal file
@@ -0,0 +1,225 @@
import json
import logging
import math
import random
import re
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import torch

import config


plt.switch_backend("agg")


def load_glove(vocabulary):
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except (FileNotFoundError, json.JSONDecodeError):
        logger.info("cache doesn't exist")
        cache = {}
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            try:
                cache[word] = data[word]
            except KeyError:
                cache[word] = [0] * 300
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings


def tokenize(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.split(" ")
    return s


def indices_from_sentence(word2index, sentence, unknown_threshold):
    if unknown_threshold:
        # word dropout: replace a word with UNK with probability
        # unknown_threshold
        return [
            word2index.get(
                word if random.random() > unknown_threshold else config.UNK,
                word2index[config.UNK],
            )
            for word in sentence
        ]
    else:
        return [
            word2index.get(word, word2index[config.UNK]) for word in sentence
        ]


def tensor_from_sentence(word2index, sentence, unknown_threshold):
    indices = indices_from_sentence(word2index, sentence, unknown_threshold)
    indices.append(word2index[config.EOS])
    return torch.tensor(indices, dtype=torch.long, device=config.DEV)


def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    tensors = [
        tensor_from_sentence(word2index, pair[0], unknown_threshold),
        tensor_from_sentence(word2index, pair[1], unknown_threshold),
    ]
    if shuffle:
        random.shuffle(tensors)
    return tensors


def bleu(reference, hypothesis, n=4):
    if n < 1:
        return 0
    # uniform weights over the 1..n-gram precisions, so n does set the
    # maximum n-gram order
    weights = [1 / n] * n
    return sentence_bleu([reference], hypothesis, weights)
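For instance (hypothetical values):

# Hypothetical check: BLEU-2 on a toy pair of token lists.
ref = "the cat sat on the mat".split()
hyp = "the cat sat on a mat".split()
print(bleu(ref, hyp, n=2))  # 1- and 2-gram precisions, equally weighted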
def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    if shuffle:
        pairs = pairs.copy()
        random.shuffle(pairs)
    for pair in pairs:
        tensor1, tensor2 = tensors_from_pair(
            word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold
        )
        yield (tensor1,), (tensor2,)


def sent_iter(sents, word2index, unknown_threshold=0.00):
    for sent in sents:
        tensor = tensor_from_sentence(word2index, sent, unknown_threshold)
        yield (tensor,)


def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    for i in range(len(pairs) // batch_size):
        # step in strides of batch_size so batches do not overlap
        batch = pairs[i * batch_size : (i + 1) * batch_size]
        if len(batch) != batch_size:
            continue
        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        yield tensors1, tensors2


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / percent
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    # NOTE: expects `evaluate`, `encoder1`, and `attn_decoder1` to be
    # defined by the importing module
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)


def save_model(model, word2index, path):
    if not path.endswith(".tar"):
        path += ".tar"
    torch.save(
        {"weights": model.state_dict(), "word2index": word2index},
        path,
    )


def load_model(path):
    checkpoint = torch.load(path)
    return checkpoint["weights"], checkpoint["word2index"]


def extend_vocabulary(word2index, langs):
    for lang in langs:
        for word in lang.word2index:
            if word not in word2index:
                word2index[word] = len(word2index)
    return word2index
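A hypothetical sketch tying these utilities to the corpora module (assumes libs.corpora is importable):

# Hypothetical usage sketch: stream training pairs as index tensors.
pairs, lang = load_qqp_paws("train")  # from libs.corpora
for (src,), (dst,) in pair_iter(pairs, lang.word2index, shuffle=True):
    pass  # src and dst are 1-D LongTensors ending in the EOS index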