Add NLP task models

This commit is contained in:
Ekta Sood 2020-12-08 21:10:52 +01:00
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions

View file

@@ -0,0 +1,97 @@
import logging
import config
def tokenize(sent):
return sent.split(" ")
class Lang:
"""Represents the vocabulary
"""
def __init__(self, name):
self.name = name
self.word2index = {
config.PAD: 0,
config.UNK: 1,
}
self.word2count = {}
self.index2word = {
0: config.PAD,
1: config.UNK,
}
self.n_words = 2
def add_sentence(self, sentence):
assert isinstance(
sentence, (list, tuple)
), "input to add_sentence must be tokenized"
for word in sentence:
self.add_word(word)
def add_word(self, word):
if word not in self.word2index:
self.word2index[word] = self.n_words
self.word2count[word] = 1
self.index2word[self.n_words] = word
self.n_words += 1
else:
self.word2count[word] += 1
def __add__(self, other):
"""Returns a new Lang object containing the vocabulary from this and
the other Lang object
"""
new_lang = Lang(f"{self.name}_{other.name}")
# Add vocabulary from both Langs
for word in self.word2count.keys():
new_lang.add_word(word)
for word in other.word2count.keys():
new_lang.add_word(word)
# Fix the counts on the new one
for word in new_lang.word2count.keys():
new_lang.word2count[word] = self.word2count.get(
word, 0
) + other.word2count.get(word, 0)
return new_lang
def load_google(split, max_len=None):
    """Load the Google Sentence Compression Dataset"""
    logger = logging.getLogger(f"{__name__}.load_google")
    lang = Lang("compression")
    if split == "train":
        path = config.google_train_path
    elif split == "val":
        path = config.google_dev_path
    elif split == "test":
        path = config.google_test_path
    else:
        raise ValueError(f"unknown split: {split!r}")
    logger.info("loading %s from %s", split, path)
data = []
sent = []
mask = []
with open(path) as handle:
for line in handle:
line = line.strip()
if line:
w, d = line.split("\t")
sent.append(w)
mask.append(int(d))
else:
if sent and (max_len is None or len(sent) <= max_len):
data.append([sent, mask])
lang.add_sentence(sent)
sent = []
mask = []
    # flush the last sentence, applying the same max_len filter as the loop
    if sent and (max_len is None or len(sent) <= max_len):
        data.append([sent, mask])
        lang.add_sentence(sent)
return data, lang
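A minimal usage sketch for the loader above, assuming the `config` module defines `PAD`, `UNK`, and the `google_*_path` entries the code refers to:

```
# Sketch: load two splits and merge their vocabularies via Lang.__add__.
train_data, train_lang = load_google("train", max_len=100)
val_data, val_lang = load_google("val")

merged = train_lang + val_lang            # union vocabulary, summed counts
print(merged.n_words, len(train_data))
```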

View file

@@ -0,0 +1 @@
from .main import *

View file

@@ -0,0 +1,125 @@
from collections import OrderedDict
import logging
from .self_attention import Transformer
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
def random_embedding(vocab_size, embedding_dim):
pretrain_emb = np.empty([vocab_size, embedding_dim])
scale = np.sqrt(3.0 / embedding_dim)
for index in range(vocab_size):
pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
return pretrain_emb
def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
outputs = outputs.view(batch_size * seq_len, -1)
score = F.log_softmax(outputs, 1)
    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
loss = loss / batch_size
_, tag_seq = torch.max(score, 1)
tag_seq = tag_seq.view(batch_size, seq_len)
return loss, tag_seq
def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
score = torch.sigmoid(outputs)
mask = torch.zeros_like(score)
for i, v in enumerate(word_seq_length):
mask[i, 0:v] = 1
score = score * mask
loss = nn.MSELoss(reduction="sum")(
score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
)
loss = loss / batch_size
return loss, score.view(batch_size, seq_len)
class Network(nn.Module):
def __init__(
self,
embedding_type,
vocab_size,
embedding_dim,
dropout,
hidden_dim,
embeddings=None,
attention=True,
):
super().__init__()
self.logger = logging.getLogger(f"{__name__}")
self.attention = attention
prelayers = OrderedDict()
postlayers = OrderedDict()
if embedding_type in ("w2v", "glove"):
if embeddings is not None:
prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings, freeze=True)
else:
prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300  # w2v/GloVe vectors are 300-d; later layers use this size
elif embedding_type == "bert":
embedding_dim = 768
self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)
if self.attention:
postlayers["attention_layer"] = Transformer(
d_model=hidden_dim, n_heads=4, n_layers=1
)
postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
postlayers["ff_activation"] = nn.ReLU()
postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)
self.logger.info(f"prelayers: {prelayers.keys()}")
self.logger.info(f"postlayers: {postlayers.keys()}")
self.pre = nn.Sequential(prelayers)
self.post = nn.Sequential(postlayers)
def forward(self, x, word_seq_length):
x = self.pre(x)
x = self.lstm(x, word_seq_length)
output = []
for _x, l in zip(x.transpose(1, 0), word_seq_length):
output.append(self.post(_x[:l].unsqueeze(0))[0])
return pad_sequence(output, batch_first=True)
class BiLSTM(nn.Module):
def __init__(self, embedding_dim, lstm_hidden, num_layers):
super().__init__()
self.net = nn.LSTM(
input_size=embedding_dim,
hidden_size=lstm_hidden,
num_layers=num_layers,
batch_first=True,
bidirectional=True,
)
def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(
            x, word_seq_length, batch_first=True, enforce_sorted=False
        )
        lstm_out, _ = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
return lstm_out
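A shape-check sketch for the model above, using the "glove" branch with randomly initialised embeddings (the fallback when `embeddings=None`); sizes are toy values:

```
import torch

model = Network(
    embedding_type="glove",
    vocab_size=100,
    embedding_dim=300,   # overridden to 300 internally for w2v/glove
    dropout=0.1,
    hidden_dim=128,      # must be divisible by the 4 attention heads
)
model.eval()
tokens = torch.randint(0, 100, (2, 7))   # [batch, seq_len] word indices
lengths = torch.tensor([7, 5])           # true lengths; padding is ignored
scores = model(tokens, lengths)
print(scores.shape)                      # torch.Size([2, 7, 1])
```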

View file

@@ -0,0 +1,128 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
class PositionalEncoding(nn.Module):
def __init__(self, d_hid, n_position=200):
super(PositionalEncoding, self).__init__()
self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))
def _get_sinusoid_encoding_table(self, n_position, d_hid):
''' Sinusoid position encoding table '''
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
return torch.FloatTensor(sinusoid_table).unsqueeze(0)
def forward(self, x):
return x + self.pos_table[:, :x.size(1)].clone().detach()
class AttentionLayer(nn.Module):
def __init__(self):
super(AttentionLayer, self).__init__()
def forward(self, Q, K, V):
# Q: float32:[batch_size, n_queries, d_k]
# K: float32:[batch_size, n_keys, d_k]
# V: float32:[batch_size, n_keys, d_v]
dk = K.shape[-1]
dv = V.shape[-1]
KT = torch.transpose(K, -1, -2)
weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
# weight_logits: float32[batch_size, n_queries, n_keys]
weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
return torch.bmm(weights, V)
# return float32[batch_size, n_queries, dv]
class MultiHeadedSelfAttentionLayer(nn.Module):
def __init__(self, d_model, n_heads):
super(MultiHeadedSelfAttentionLayer, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
        assert d_model % n_heads == 0, \
            f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
self.d_k = d_model // n_heads
self.d_v = self.d_k
self.attention_layer = AttentionLayer()
self.W_Qs = nn.ModuleList([
nn.Linear(d_model, self.d_k, bias=False)
for _ in range(n_heads)
])
self.W_Ks = nn.ModuleList([
nn.Linear(d_model, self.d_k, bias=False)
for _ in range(n_heads)
])
self.W_Vs = nn.ModuleList([
nn.Linear(d_model, self.d_v, bias=False)
for _ in range(n_heads)
])
self.W_O = nn.Linear(d_model, d_model, bias=False)
def forward(self, x):
# x:float32[batch_size, sequence_length, self.d_model]
head_outputs = []
for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
Q = W_Q(x)
# Q float32:[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K float32:[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V float32:[batch_size, sequence_length, self.d_v]
head_output = self.attention_layer(Q, K, V)
# float32:[batch_size, sequence_length, self.d_v]
head_outputs.append(head_output)
concatenated = torch.cat(head_outputs, dim=-1)
# concatenated float32:[batch_size, sequence_length, self.d_model]
out = self.W_O(concatenated)
# out float32:[batch_size, sequence_length, self.d_model]
return out
class Feedforward(nn.Module):
def __init__(self, d_model):
super(Feedforward, self).__init__()
self.d_model = d_model
self.W1 = nn.Linear(d_model, d_model)
self.W2 = nn.Linear(d_model, d_model)
def forward(self, x):
# x: float32[batch_size, sequence_length, d_model]
return self.W2(torch.relu(self.W1(x)))
class Transformer(nn.Module):
def __init__(self, d_model, n_heads, n_layers):
super(Transformer, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers
self.attention_layers = nn.ModuleList([
MultiHeadedSelfAttentionLayer(d_model, n_heads)
for _ in range(n_layers)
])
self.ffs = nn.ModuleList([
Feedforward(d_model)
for _ in range(n_layers)
])
def forward(self, x):
# x: float32[batch_size, sequence_length, self.d_model]
for attention_layer, ff in zip(self.attention_layers, self.ffs):
attention_out = attention_layer(x)
# attention_out: float32[batch_size, sequence_length, self.d_model]
x = F.layer_norm(x + attention_out, x.shape[2:])
ff_out = ff(x)
# ff_out: float32[batch_size, sequence_length, self.d_model]
x = F.layer_norm(x + ff_out, x.shape[2:])
return x
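A quick shape check for the encoder stack above (toy sizes):

```
import torch

x = torch.randn(8, 10, 64)                    # [batch, seq_len, d_model]
x = PositionalEncoding(d_hid=64, n_position=50)(x)
out = Transformer(d_model=64, n_heads=4, n_layers=2)(x)
assert out.shape == (8, 10, 64)               # the stack is shape-preserving
```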

View file

@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2018, Tatsuya Aoki
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@@ -0,0 +1,31 @@
# Simple Model for Sentence Compression
A 3-layer BiLSTM model for sentence compression, referred to as the Baseline in [Klerke et al., NAACL 2016](http://aclweb.org/anthology/N/N16/N16-1179.pdf).
## Requirements
### Framework
- python (<= 3.6)
- pytorch (<= 0.3.0)
### Packages
- torchtext
## How to run
```
./getdata
python main.py
```
To run the scripts on a GPU, use `python main.py --gpu-id ID`, where `ID` is an integer from 0 up to the number of GPUs you have minus one.
## Reference
```
@InProceedings{klerke-goldberg-sogaard:2016:N16-1,
author = {Klerke, Sigrid and Goldberg, Yoav and S{\o}gaard, Anders},
title = {Improving sentence compression by learning to predict gaze},
booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
month = {June},
year = {2016},
address = {San Diego, California},
publisher = {Association for Computational Linguistics},
pages = {1528--1533},
url = {http://www.aclweb.org/anthology/N16-1179}
}
```

View file

@@ -0,0 +1 @@
from .main import *

View file

@@ -0,0 +1,95 @@
from torchtext import data
from const import Phase
def create_dataset(data: dict, batch_size: int, device: int):
train = Dataset(data[Phase.TRAIN]['tokens'],
data[Phase.TRAIN]['labels'],
vocab=None,
batch_size=batch_size,
device=device,
phase=Phase.TRAIN)
dev = Dataset(data[Phase.DEV]['tokens'],
data[Phase.DEV]['labels'],
vocab=train.vocab,
batch_size=batch_size,
device=device,
phase=Phase.DEV)
test = Dataset(data[Phase.TEST]['tokens'],
data[Phase.TEST]['labels'],
vocab=train.vocab,
batch_size=batch_size,
device=device,
phase=Phase.TEST)
return train, dev, test
class Dataset:
def __init__(self,
tokens: list,
label_list: list,
vocab: list,
batch_size: int,
device: int,
phase: Phase):
assert len(tokens) == len(label_list), \
'the number of sentences and the number of POS/head sequences \
should be the same length'
self.pad_token = '<PAD>'
# self.unk_token = '<UNK>'
self.tokens = tokens
self.label_list = label_list
self.sentence_id = [[i] for i in range(len(tokens))]
self.device = device
self.token_field = data.Field(use_vocab=True,
# unk_token=self.unk_token,
pad_token=self.pad_token,
batch_first=True)
self.label_field = data.Field(use_vocab=False, pad_token=-1, batch_first=True)
self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)
self.dataset = self._create_dataset()
if vocab is None:
self.token_field.build_vocab(self.tokens)
self.vocab = self.token_field.vocab
else:
self.token_field.vocab = vocab
self.vocab = vocab
self.pad_index = self.token_field.vocab.stoi[self.pad_token]
self._set_batch_iter(batch_size, phase)
def get_raw_sentence(self, sentences):
return [[self.vocab.itos[idx] for idx in sentence]
for sentence in sentences]
def _create_dataset(self):
_fields = [('token', self.token_field),
('label', self.label_field),
('sentence_id', self.sentence_id_field)]
return data.Dataset(self._get_examples(_fields), _fields)
def _get_examples(self, fields: list):
ex = []
for sentence, label, sentence_id in zip(self.tokens, self.label_list, self.sentence_id):
ex.append(data.Example.fromlist([sentence, label, sentence_id], fields))
return ex
def _set_batch_iter(self, batch_size: int, phase: Phase):
        def sort(example: data.Example) -> int:
            # sort key: sentence length in tokens (renamed to avoid shadowing
            # the torchtext `data` module imported at the top of this file)
            return len(getattr(example, 'token'))
        train = phase == Phase.TRAIN
self.batch_iter = data.BucketIterator(dataset=self.dataset,
batch_size=batch_size,
sort_key=sort,
train=train,
repeat=False,
device=self.device)
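A usage sketch for the wrapper above with toy data; it targets the legacy `torchtext.data` API, where `device=-1` selects the CPU:

```
from const import Phase

toy = {
    Phase.TRAIN: {'tokens': [['a', 'b'], ['c']], 'labels': [[0, 1], [0]]},
    Phase.DEV:   {'tokens': [['a']], 'labels': [[1]]},
    Phase.TEST:  {'tokens': [['b']], 'labels': [[0]]},
}
train, dev, test = create_dataset(toy, batch_size=2, device=-1)
for batch in train.batch_iter:
    print(batch.token.shape, batch.label.shape)   # padded [batch, seq_len]
```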

View file

@@ -0,0 +1,8 @@
from enum import Enum, unique
@unique
class Phase(Enum):
TRAIN = 'train'
DEV = 'dev'
TEST = 'test'

View file

@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
class Network(nn.Module):
    def __init__(self,
                 embeddings,
                 hidden_size: int,
                 prior,
                 device: torch.device,
                 attention: bool = True):
        super(Network, self).__init__()
        self.device = device
        # forward() and the layer setup below both gate on this flag
        self.attention = attention
        self.priors = torch.log(torch.tensor([prior, 1 - prior])).to(device)
self.hidden_size = hidden_size
self.bilstm_layers = 3
self.bilstm_input_size = 300
self.bilstm_output_size = 2 * hidden_size
self.word_emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
self.bilstm = nn.LSTM(self.bilstm_input_size,
self.hidden_size,
num_layers=self.bilstm_layers,
batch_first=True,
dropout=0.1, #ms best mod 0.1
bidirectional=True)
self.dropout = nn.Dropout(p=0.35)
if self.attention:
self.attention_size = self.bilstm_output_size * 2
self.u_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
self.w_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
self.v_a_inv = nn.Linear(self.bilstm_output_size, 1, bias=False)
self.linear_attn = nn.Linear(self.attention_size, self.bilstm_output_size)
self.linear = nn.Linear(self.bilstm_output_size, self.hidden_size)
self.pred = nn.Linear(self.hidden_size, 2)
self.softmax = nn.LogSoftmax(dim=1)
self.criterion = nn.NLLLoss(ignore_index=-1)
def forward(self, input_tokens, labels, fixations=None):
loss = 0.0
preds = []
atts = []
batch_size, seq_len = input_tokens.size()
self.init_hidden(batch_size, device=self.device)
x_i = self.word_emb(input_tokens)
x_i = self.dropout(x_i)
hidden, (self.h_n, self.c_n) = self.bilstm(x_i, (self.h_n, self.c_n))
_, _, hidden_size = hidden.size()
for i in range(seq_len):
nth_hidden = hidden[:, i, :]
if self.attention:
target = nth_hidden.expand(seq_len, batch_size, -1).transpose(0, 1)
mask = hidden.eq(target)[:, :, 0].unsqueeze(2)
attn_weight = self.attention(hidden, target, fixations, mask)
context_vector = torch.bmm(attn_weight.transpose(1, 2), hidden).squeeze(1)
nth_hidden = torch.tanh(self.linear_attn(torch.cat((nth_hidden, context_vector), -1)))
atts.append(attn_weight.detach().cpu())
logits = self.pred(self.linear(nth_hidden))
if not self.training:
logits = logits + self.priors
output = self.softmax(logits)
loss += self.criterion(output, labels[:, i])
_, topi = output.topk(k=1, dim=1)
pred = topi.squeeze(-1)
preds.append(pred)
preds = torch.stack(torch.cat(preds, dim=0).split(batch_size), dim=1)
if atts:
atts = torch.stack(torch.cat(atts, dim=0).split(batch_size), dim=1)
return loss, preds, atts
def attention(self, source, target, fixations=None, mask=None):
function_g = \
self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
if mask is not None:
function_g.masked_fill_(mask, -1e4)
if fixations is not None:
function_g = function_g*fixations
return nn.functional.softmax(function_g, dim=1)
    def init_hidden(self, batch_size, device):
        # plain tensors; torch.autograd.Variable is deprecated
        self.h_n = torch.zeros(2 * self.bilstm_layers, batch_size, self.hidden_size, device=device)
        self.c_n = torch.zeros_like(self.h_n)
return self.h_n, self.c_n
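A smoke-test sketch for the model above, with random stand-in embeddings (real use would pass pretrained 300-d vectors):

```
import torch

emb = torch.randn(50, 300)                 # stand-in embedding matrix
net = Network(embeddings=emb, hidden_size=64, prior=0.5,
              device=torch.device('cpu'))
tokens = torch.randint(0, 50, (4, 9))      # [batch, seq_len]
labels = torch.randint(0, 2, (4, 9))       # 0 = keep, 1 = delete
loss, preds, atts = net(tokens, labels)
loss.backward()
print(loss.item(), preds.shape)            # scalar loss, [4, 9] predictions
```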

View file

@@ -0,0 +1,183 @@
import os
import torch
from torch import optim
import tqdm
from const import Phase
from batch import create_dataset
from models import Baseline
from sklearn.metrics import classification_report
def run(dataset_train,
dataset_dev,
dataset_test,
model_type,
word_embed_size,
hidden_size,
batch_size,
device,
n_epochs):
if model_type == 'base':
model = Baseline(vocab=dataset_train.vocab,
word_embed_size=word_embed_size,
hidden_size=hidden_size,
device=device,
inference=False)
else:
raise NotImplementedError
model = model.to(device)
optim_params = model.parameters()
optimizer = optim.Adam(optim_params, lr=10**-3)
print('start training')
for epoch in range(n_epochs):
train_loss, tokens, preds, golds = train(dataset_train,
model,
optimizer,
batch_size,
epoch,
Phase.TRAIN,
device)
dev_loss, tokens, preds, golds = train(dataset_dev,
model,
optimizer,
batch_size,
epoch,
Phase.DEV,
device)
logger = '\t'.join(['epoch {}'.format(epoch+1),
'TRAIN Loss: {:.9f}'.format(train_loss),
'DEV Loss: {:.9f}'.format(dev_loss)])
# print('\r'+logger, end='')
print(logger)
test_loss, tokens, preds, golds = train(dataset_test,
model,
optimizer,
batch_size,
epoch,
Phase.TEST,
device)
print('====', 'TEST', '=====')
print_scores(preds, golds)
output_results(tokens, preds, golds)
def train(dataset,
model,
optimizer,
batch_size,
n_epoch,
phase,
device):
total_loss = 0.0
tokens = []
preds = []
labels = []
if phase == Phase.TRAIN:
model.train()
else:
model.eval()
for batch in tqdm.tqdm(dataset.batch_iter):
token = getattr(batch, 'token')
label = getattr(batch, 'label')
raw_sentences = dataset.get_raw_sentence(token.data.detach().cpu().numpy())
loss, pred = \
model(token, raw_sentences, label, phase)
        if phase == Phase.TRAIN:
            optimizer.zero_grad()
            loss.backward()
            # clip gradients only after backward() has produced them
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
# remove PAD from input sentences/labels and results
mask = (token != dataset.pad_index)
length_tensor = mask.sum(1)
length_tensor = length_tensor.data.detach().cpu().numpy()
for index, n_tokens_in_the_sentence in enumerate(length_tensor):
if n_tokens_in_the_sentence > 0:
tokens.append(raw_sentences[index][:n_tokens_in_the_sentence])
_label = label[index][:n_tokens_in_the_sentence]
_pred = pred[index][:n_tokens_in_the_sentence]
_label = _label.data.detach().cpu().numpy()
_pred = _pred.data.detach().cpu().numpy()
labels.append(_label)
preds.append(_pred)
total_loss += torch.mean(loss).item()
return total_loss, tokens, preds, labels
def read_two_cols_data(fname, max_len=None):
data = {}
tokens = []
labels = []
token = []
label = []
with open(fname, mode='r') as f:
for line in f:
line = line.strip().lower().split()
if line:
                try:
                    _token, _label = line
                except ValueError:
                    raise ValueError(f'expected two columns, got {line!r}')
token.append(_token)
                if _label in ('0', '1'):
                    label.append(int(_label))
                elif _label == 'del':
                    label.append(1)
                else:
                    label.append(0)
            else:
                if max_len is None or len(token) <= max_len:
                    tokens.append(token)
                    labels.append(label)
                token = []
                label = []
    # flush the last sentence if the file does not end with a blank line
    if token and (max_len is None or len(token) <= max_len):
        tokens.append(token)
        labels.append(label)
data['tokens'] = tokens
data['labels'] = labels
return data
def load(train_path, dev_path, test_path, batch_size, max_len, device):
train = read_two_cols_data(train_path, max_len)
dev = read_two_cols_data(dev_path)
test = read_two_cols_data(test_path)
data = {Phase.TRAIN: train, Phase.DEV: dev, Phase.TEST: test}
return create_dataset(data, batch_size=batch_size, device=device)
def print_scores(preds, golds):
_preds = [label for sublist in preds for label in sublist]
_golds = [label for sublist in golds for label in sublist]
target_names = ['not_del', 'del']
print(classification_report(_golds, _preds, target_names=target_names, digits=5))
def output_results(tokens, preds, golds, path='./result/sentcomp'):
    # make sure the output directory exists before opening the files
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path+'.original.txt', mode='w') as w, \
            open(path+'.gold.txt', mode='w') as w_gold, \
            open(path+'.pred.txt', mode='w') as w_pred:
for _tokens, _golds, _preds in zip(tokens, golds, preds):
for token, gold, pred in zip(_tokens, _golds, _preds):
w.write(token + ' ')
if gold == 0:
w_gold.write(token + ' ')
# 0 -> keep, 1 -> delete
if pred == 0:
w_pred.write(token + ' ')
w.write('\n')
w_gold.write('\n')
w_pred.write('\n')
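For reference, a sketch of the two-column format `read_two_cols_data` parses: one token and one keep/delete flag per line, sentences separated by blank lines (`del` maps to 1, other non-binary labels to 0):

```
sample = 'the 0\ncat 0\nsat del\n\n'
with open('/tmp/toy_compression.txt', mode='w') as f:
    f.write(sample)
print(read_two_cols_data('/tmp/toy_compression.txt'))
# {'tokens': [['the', 'cat', 'sat']], 'labels': [[0, 0, 1]]}
```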

View file

@@ -0,0 +1,218 @@
import json
import logging
import math
import os
import random
import re
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn
import config
plt.switch_backend("agg")
def load_glove(vocabulary):
logger = logging.getLogger(f"{__name__}.load_glove")
logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except (FileNotFoundError, json.JSONDecodeError):
        logger.info("cache doesn't exist")
        cache = {}
cache[config.PAD] = [0] * 300
cache[config.SOS] = [0] * 300
cache[config.EOS] = [0] * 300
cache[config.UNK] = [0] * 300
cache[config.NOFIX] = [0] * 300
else:
logger.info("cache found")
cache_miss = False
if not set(vocabulary) <= set(cache):
cache_miss = True
        logger.warning("cache miss, loading full embeddings")
data = {}
with open("glove.840B.300d.txt") as h:
for line in h:
word, *emb = line.strip().split()
            try:
                data[word] = [float(x) for x in emb]
            except ValueError:
                # skip malformed lines (e.g. tokens containing spaces)
                continue
logger.info("finished loading full embeddings")
for word in vocabulary:
try:
cache[word] = data[word]
except KeyError:
cache[word] = [0] * 300
logger.info("cache updated")
embeddings = []
for word in vocabulary:
embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
embeddings = torch.stack(embeddings)
if cache_miss:
with open(f"glove.cache", "w") as h:
json.dump(cache, h)
logger.info("cache saved")
return embeddings
def tokenize(s):
s = s.lower().strip()
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
s = s.split(" ")
return s
def indices_from_sentence(word2index, sentence, unknown_threshold):
if unknown_threshold:
return [
word2index.get(
word if random.random() > unknown_threshold else config.UNK,
word2index[config.UNK],
)
for word in sentence
]
else:
return [
word2index.get(word, word2index[config.UNK]) for word in sentence
]
def tensor_from_sentence(word2index, sentence, unknown_threshold):
indices = indices_from_sentence(word2index, sentence, unknown_threshold)
return torch.tensor(indices, dtype=torch.long, device=config.DEV)
def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
tensors = [
tensor_from_sentence(word2index, pair[0], unknown_threshold),
tensor_from_sentence(word2index, pair[1], unknown_threshold),
]
if shuffle:
random.shuffle(tensors)
return tensors
def bleu(reference, hypothesis, n=4):
if n < 1:
return 0
weights = [1/n]*n
return sentence_bleu([reference], hypothesis, weights)
def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
if shuffle:
pairs = pairs.copy()
random.shuffle(pairs)
for pair in pairs:
tensor1, tensor2 = tensors_from_pair(word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold)
yield (tensor1,), (tensor2,)
def sent_iter(sents, word2index, batch_size, unknown_threshold=0.00):
for i in range(len(sents)//batch_size+1):
raw_sents = [x[0] for x in sents[i*batch_size:i*batch_size+batch_size]]
_sents = [tensor_from_sentence(word2index, sent, unknown_threshold) for sent, target in sents[i*batch_size:i*batch_size+batch_size]]
_targets = [torch.tensor(target, dtype=torch.long).to(config.DEV) for sent, target in sents[i*batch_size:i*batch_size+batch_size]]
if raw_sents and _sents and _targets:
            yield raw_sents, _sents, _targets
def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
for i in range(len(pairs) // batch_size):
        batch = pairs[i * batch_size : (i + 1) * batch_size]
if len(batch) != batch_size:
continue
batch_tensors = [
tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
for pair in batch
]
tensors1, tensors2 = zip(*batch_tensors)
yield tensors1, tensors2
def asMinutes(s):
m = math.floor(s / 60)
s -= m * 60
return "%dm %ds" % (m, s)
def timeSince(since, percent):
now = time.time()
s = now - since
es = s / (percent)
rs = es - s
return "%s (- %s)" % (asMinutes(s), asMinutes(rs))
def showPlot(points):
    fig, ax = plt.subplots()
# this locator puts ticks at regular intervals
loc = ticker.MultipleLocator(base=0.2)
ax.yaxis.set_major_locator(loc)
plt.plot(points)
def showAttention(input_sentence, output_words, attentions):
# Set up figure with colorbar
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(attentions.numpy(), cmap="bone")
fig.colorbar(cax)
# Set up axes
ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
ax.set_yticklabels([""] + output_words)
# Show label at every tick
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
plt.show()
def evaluateAndShowAttention(input_sentence):
    # NOTE: relies on `evaluate`, `encoder1`, and `attn_decoder1` being
    # defined by the calling script; they are not part of this module
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
print("input =", input_sentence)
print("output =", " ".join(output_words))
showAttention(input_sentence, output_words, attentions)
def save_model(model, word2index, path):
if not path.endswith(".tar"):
path += ".tar"
torch.save(
{"weights": model.state_dict(), "word2index": word2index},
path,
)
def load_model(path):
checkpoint = torch.load(path)
return checkpoint["weights"], checkpoint["word2index"]
def extend_vocabulary(word2index, langs):
for lang in langs:
for word in lang.word2index:
if word not in word2index:
word2index[word] = len(word2index)
return word2index
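A small check of the `bleu` helper above, which wraps NLTK's `sentence_bleu` with uniform n-gram weights:

```
ref = "the cat sat on the mat".split()
hyp = "the cat sat on a mat".split()
print(bleu(ref, hyp, n=2))   # BLEU-2 with weights (0.5, 0.5)
```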