Add NLP task models
This commit is contained in:
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions
0  joint_paraphrase_model/libs/__init__.py  Normal file
416  joint_paraphrase_model/libs/corpora.py  Normal file
@@ -0,0 +1,416 @@
import logging

import config


def tokenize(sent):
    return sent.split(" ")


class Lang:
    """Represents the vocabulary."""

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
            config.NOFIX: 2,
            config.SOS: 3,
            config.EOS: 4,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
            2: config.NOFIX,
            3: config.SOS,
            4: config.EOS,
        }
        self.n_words = 5

    def add_sentence(self, sentence):
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object.
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count.keys():
            new_lang.add_word(word)
        for word in other.word2count.keys():
            new_lang.add_word(word)

        # Fix the counts on the new one
        for word in new_lang.word2count.keys():
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang
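A quick, hypothetical sanity check of how `Lang.__add__` merges two vocabularies (not part of the commit; `config.*` are whatever token constants the repo's config module defines):

# Hypothetical usage sketch: merging two vocabularies.
src = Lang("src")
src.add_sentence(tokenize("the cat sat"))
dst = Lang("dst")
dst.add_sentence(tokenize("the dog sat"))
merged = src + dst
assert merged.word2count["the"] == 2  # counts are summed across both Langs
assert merged.word2count["cat"] == 1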
def load_wiki(split):
    """Load the Wiki portion of PAWS."""
    logger = logging.getLogger(f"{__name__}.load_wiki")
    lang = Lang("wiki")

    if split == "train":
        path = config.wiki_train_path
    elif split == "val":
        path = config.wiki_dev_path
    elif split == "test":
        path = config.wiki_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_qqp_paws(split):
    """Load the QQP portion of PAWS."""
    logger = logging.getLogger(f"{__name__}.load_qqp_paws")
    lang = Lang("qqp_paws")

    if split == "train":
        path = config.qqp_paws_train_path
    elif split == "val":
        path = config.qqp_paws_dev_path
    elif split == "test":
        path = config.qqp_paws_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_qqp(split):
    """Load the original QQP."""
    logger = logging.getLogger(f"{__name__}.load_qqp")
    lang = Lang("qqp")

    if split == "train":
        path = config.qqp_train_path
    elif split == "val":
        path = config.qqp_dev_path
    elif split == "test":
        path = config.qqp_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            rating, sent1, sent2, _ = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_qqp_kag(split):
    """Load QQP from Kaggle.

    Not the original split right now; experimenting with the Kaggle
    100K/3K/30K split.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp_kag")
    lang = Lang("qqp_kag")

    if split == "train":
        path = config.qqp_kag_train_path
    elif split == "val":
        path = config.qqp_kag_dev_path
    elif split == "test":
        path = config.qqp_kag_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        # the Kaggle version has three fields per line instead of four
        for line in handle:
            rating, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_msrpc(split):
    """Load the Microsoft Research Paraphrase Corpus (MSRPC)."""
    logger = logging.getLogger(f"{__name__}.load_msrpc")
    lang = Lang("msrpc")

    if split == "train":
        path = config.msrpc_train_path
    elif split == "val":
        path = config.msrpc_dev_path
    elif split == "test":
        path = config.msrpc_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            rating, _, _, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    # MS: both sides of a paraphrase pair share a single vocabulary
    return pairs, lang


def load_sentiment(split):
    """Load the Kaggle sentiment competition dataset."""
    logger = logging.getLogger(f"{__name__}.load_sentiment")
    lang = Lang("sentiment")

    if split == "train":
        path = config.sentiment_train_path
    elif split == "val":
        path = config.sentiment_dev_path
    elif split == "test":
        path = config.sentiment_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_tamil(split):
    """Load the English-to-Tamil dataset (current SOTA is roughly 13 BLEU)."""
    logger = logging.getLogger(f"{__name__}.load_tamil")
    lang = Lang("tamil")

    if split == "train":
        path = config.tamil_train_path
    elif split == "val":
        path = config.tamil_dev_path
    elif split == "test":
        path = config.tamil_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            # TODO: plain whitespace splitting is probably not an adequate
            # tokenizer for Tamil
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_compression(split):
    """Load the Google sentence compression dataset."""
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.compression_train_path
    elif split == "val":
        path = config.compression_dev_path
    elif split == "test":
        path = config.compression_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford(split):
    """Load the phrase-level Stanford sentiment dataset."""
    logger = logging.getLogger(f"{__name__}.load_stanford")
    lang = Lang("stanford")

    if split == "train":
        path = config.stanford_train_path
    elif split == "val":
        path = config.stanford_dev_path
    elif split == "test":
        path = config.stanford_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # this file has no header line to skip

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford_sent(split):
    """Load the sentence-level Stanford sentiment dataset."""
    logger = logging.getLogger(f"{__name__}.load_stanford_sent")
    lang = Lang("stanford_sent")

    if split == "train":
        path = config.stanford_sent_train_path
    elif split == "val":
        path = config.stanford_sent_dev_path
    elif split == "test":
        path = config.stanford_sent_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # this file has no header line to skip

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang
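All loaders share the same `(pairs, lang)` contract, so swapping datasets is a one-line change. A hypothetical sketch (the `config.*` paths are whatever the repo's config defines):

# Hypothetical usage sketch: load a split and build a joint vocabulary.
train_pairs, train_lang = load_qqp_paws("train")
val_pairs, val_lang = load_qqp_paws("val")
lang = train_lang + val_lang  # merge vocabularies via Lang.__add__
print(len(train_pairs), lang.n_words)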
1  joint_paraphrase_model/libs/fixation_generation/__init__.py  Normal file
@@ -0,0 +1 @@
from .main import *
131  joint_paraphrase_model/libs/fixation_generation/main.py  Normal file
@@ -0,0 +1,131 @@
from collections import OrderedDict
import logging

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from .self_attention import Transformer


def random_embedding(vocab_size, embedding_dim):
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)

    # ignore_index=0 masks out the padding label
    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)

    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    score = torch.sigmoid(outputs)

    # zero out positions beyond each sequence's true length
    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1

    score = score * mask

    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )

    loss = loss / batch_size

    return loss, score.view(batch_size, seq_len)
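A small, hypothetical shape check for mse_loss (random values; in real use the labels are zero-padded past each sequence's true length):

# Hypothetical sanity check: batch of 2, max length 5, true lengths 5 and 3.
outputs = torch.randn(2, 5)  # raw per-token fixation logits
labels = torch.rand(2, 5)    # fixation targets in [0, 1]
labels[1, 3:] = 0            # zero-pad past the true length
loss, preds = mse_loss(outputs, labels, batch_size=2, seq_len=5,
                       word_seq_length=[5, 3])
print(loss.item(), preds.shape)  # scalar loss, torch.Size([2, 5])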
class Network(nn.Module):
    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300  # w2v/GloVe vectors are 300-dimensional
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if attention:
            # Experiment history: 1024-d with 16 heads / 16 layers for the
            # attention vs. no-attention runs; 4 heads / 4 layers at 128-d for
            # the initial attention and pretraining runs; 4 heads / 1 layer at
            # 128-d produced the results reported for IUI.
            # postlayers["position_encodings"] = PositionalEncoding(hidden_dim)
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)
        return self.post(x.transpose(1, 0))


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(x, word_seq_length, True, False)
        lstm_out, hidden = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
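A hypothetical end-to-end shape check for the fixation network (assumes a 10k vocabulary and random GloVe-style embeddings; not part of the commit):

# Hypothetical usage sketch: predict one fixation score per token.
emb = torch.tensor(random_embedding(10000, 300), dtype=torch.float32)
net = Network("glove", vocab_size=10000, embedding_dim=300,
              dropout=0.1, hidden_dim=128, embeddings=emb)
tokens = torch.randint(0, 10000, (2, 7))      # batch of 2, length 7
scores = net(tokens, word_seq_length=[7, 7])  # enforce_sorted=False, so lengths need not be sorted
print(scores.shape)                           # (2, 7, 1): one score per token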
131  joint_paraphrase_model/libs/fixation_generation/self_attention.py  Normal file
@@ -0,0 +1,131 @@
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        # Not a parameter
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy

        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()


class AttentionLayer(nn.Module):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def forward(self, Q, K, V):
        # Q: float32[batch_size, n_queries, d_k]
        # K: float32[batch_size, n_keys, d_k]
        # V: float32[batch_size, n_keys, d_v]
        dk = K.shape[-1]
        KT = torch.transpose(K, -1, -2)
        # scaled dot-product attention
        weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
        # weight_logits: float32[batch_size, n_queries, n_keys]
        weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
        return torch.bmm(weights, V)
        # returns float32[batch_size, n_queries, d_v]
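A quick, hypothetical check of the attention layer in isolation (random inputs):

# Hypothetical sanity check: attention weights mix the value vectors.
att = AttentionLayer()
Q = torch.randn(1, 2, 8)   # one batch, two queries, d_k = 8
K = torch.randn(1, 5, 8)   # five keys
V = torch.randn(1, 5, 16)  # five values, d_v = 16
out = att(Q, K, V)
print(out.shape)           # torch.Size([1, 2, 16])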
class MultiHeadedSelfAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        # one projection per head for queries, keys, and values
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q: float32[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K: float32[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V: float32[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # head_output: float32[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated: float32[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out: float32[batch_size, sequence_length, self.d_model]
        return out


class Feedforward(nn.Module):
    def __init__(self, d_model):
        super(Feedforward, self).__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, d_model]
        return self.W2(torch.relu(self.W1(x)))


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, n_layers):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList([
            MultiHeadedSelfAttentionLayer(d_model, n_heads)
            for _ in range(n_layers)
        ])
        self.ffs = nn.ModuleList([
            Feedforward(d_model)
            for _ in range(n_layers)
        ])

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        for attention_layer, ff in zip(self.attention_layers, self.ffs):
            attention_out = attention_layer(x)
            # attention_out: float32[batch_size, sequence_length, self.d_model]
            # residual connection followed by layer norm
            x = F.layer_norm(x + attention_out, x.shape[2:])
            ff_out = ff(x)
            # ff_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + ff_out, x.shape[2:])
        return x
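A hypothetical smoke test for the encoder stack, which is shape-preserving by construction:

# Hypothetical usage sketch: the stack maps [batch, seq, d_model] to itself.
enc = Transformer(d_model=128, n_heads=4, n_layers=1)
x = torch.randn(2, 7, 128)
y = enc(x)
print(y.shape)  # torch.Size([2, 7, 128]) -- same shape in, same shape out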
1  joint_paraphrase_model/libs/paraphrase_generation/__init__.py  Normal file
@@ -0,0 +1 @@
from .main import *
86  joint_paraphrase_model/libs/paraphrase_generation/main.py  Normal file
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embeddings):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        embeddings,
        dropout_p,
        max_length,
    ):
        super(AttnDecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding.from_pretrained(embeddings)  # for paraphrase generation
        # self.embedding = nn.Embedding(len(embeddings), 300)  # for NMT with Tamil; trying with sentiment too
        self.attn = nn.Linear(self.input_size + self.hidden_size, self.max_length)
        self.attn_combine = nn.Linear(
            self.input_size + self.hidden_size, self.hidden_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, fixations):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1
        )

        # reweight the attention distribution by the predicted fixations,
        # zero-padding the fixation vector up to max_length
        pad = nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)
        attn_weights = attn_weights * pad(fixations.squeeze().unsqueeze(0))

        attn_applied = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        # note: no log_softmax here; the caller's loss is expected to
        # handle normalization
        output = self.out(output[0])
        return output, hidden, attn_weights
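A hypothetical single decoding step wiring the two modules together with a fixation vector (dimensions chosen to fit the one-token-at-a-time, tutorial-style interface; not part of the commit):

# Hypothetical usage sketch: one decoder step with random fixations.
vocab_size, emb_dim, hidden, max_len = 1000, 300, 256, 10
emb = torch.randn(vocab_size, emb_dim)
encoder = EncoderRNN(emb_dim, hidden, emb)
decoder = AttnDecoderRNN(emb_dim, hidden, vocab_size, emb, 0.1, max_len)

# encode a 4-token input one token at a time
enc_outputs = torch.zeros(max_len, hidden)
hidden_state = encoder.initHidden()
for i, tok in enumerate(torch.tensor([5, 42, 7, 99])):
    out, hidden_state = encoder(tok, hidden_state)
    enc_outputs[i] = out[0, 0]

fixations = torch.rand(4, 1)  # one fixation score per source token
logits, hidden_state, attn = decoder(
    torch.tensor([3]), hidden_state, enc_outputs, fixations
)
print(logits.shape)  # torch.Size([1, 1000])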
225  joint_paraphrase_model/libs/utils.py  Normal file
@@ -0,0 +1,225 @@
import json
import logging
import math
import random
import re
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import torch

import config


plt.switch_backend("agg")


def load_glove(vocabulary):
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except (FileNotFoundError, json.JSONDecodeError):
        logger.info("cache doesn't exist")
        cache = {}
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            try:
                cache[word] = data[word]
            except KeyError:
                cache[word] = [0] * 300
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings


def tokenize(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.split(" ")
    return s


def indices_from_sentence(word2index, sentence, unknown_threshold):
    if unknown_threshold:
        # word dropout: replace a word with UNK with probability
        # unknown_threshold
        return [
            word2index.get(
                word if random.random() > unknown_threshold else config.UNK,
                word2index[config.UNK],
            )
            for word in sentence
        ]
    else:
        return [
            word2index.get(word, word2index[config.UNK]) for word in sentence
        ]


def tensor_from_sentence(word2index, sentence, unknown_threshold):
    indices = indices_from_sentence(word2index, sentence, unknown_threshold)
    indices.append(word2index[config.EOS])
    return torch.tensor(indices, dtype=torch.long, device=config.DEV)


def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    tensors = [
        tensor_from_sentence(word2index, pair[0], unknown_threshold),
        tensor_from_sentence(word2index, pair[1], unknown_threshold),
    ]
    if shuffle:
        random.shuffle(tensors)
    return tensors


def bleu(reference, hypothesis, n=4):
    if n < 1:
        return 0
    # uniform weights over the 1..n-gram precisions, so n does set the
    # maximum n-gram order
    weights = [1 / n] * n
    return sentence_bleu([reference], hypothesis, weights)
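For instance (hypothetical values):

# Hypothetical check: BLEU-2 on a toy pair of token lists.
ref = "the cat sat on the mat".split()
hyp = "the cat sat on a mat".split()
print(bleu(ref, hyp, n=2))  # 1- and 2-gram precisions, equally weighted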
def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    if shuffle:
        pairs = pairs.copy()
        random.shuffle(pairs)
    for pair in pairs:
        tensor1, tensor2 = tensors_from_pair(
            word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold
        )
        yield (tensor1,), (tensor2,)


def sent_iter(sents, word2index, unknown_threshold=0.00):
    for sent in sents:
        tensor = tensor_from_sentence(word2index, sent, unknown_threshold)
        yield (tensor,)


def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    for i in range(len(pairs) // batch_size):
        # step in strides of batch_size so batches do not overlap
        batch = pairs[i * batch_size : (i + 1) * batch_size]
        if len(batch) != batch_size:
            continue
        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        yield tensors1, tensors2


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / percent
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    # NOTE: expects `evaluate`, `encoder1`, and `attn_decoder1` to be
    # defined by the importing module
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)


def save_model(model, word2index, path):
    if not path.endswith(".tar"):
        path += ".tar"
    torch.save(
        {"weights": model.state_dict(), "word2index": word2index},
        path,
    )


def load_model(path):
    checkpoint = torch.load(path)
    return checkpoint["weights"], checkpoint["word2index"]


def extend_vocabulary(word2index, langs):
    for lang in langs:
        for word in lang.word2index:
            if word not in word2index:
                word2index[word] = len(word2index)
    return word2index
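A hypothetical sketch tying these utilities to the corpora module (assumes libs.corpora is importable):

# Hypothetical usage sketch: stream training pairs as index tensors.
pairs, lang = load_qqp_paws("train")  # from libs.corpora
for (src,), (dst,) in pair_iter(pairs, lang.word2index, shuffle=True):
    pass  # src and dst are 1-D LongTensors ending in the EOS index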