Add NLP task models

Ekta Sood 2020-12-08 21:10:52 +01:00
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions

View file

@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2018, Tatsuya Aoki
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@@ -0,0 +1,31 @@
# Simple Model for Sentence Compression
A 3-layer BiLSTM model for sentence compression, referred to as the Baseline in [Klerke et al., NAACL 2016](http://aclweb.org/anthology/N/N16/N16-1179.pdf).
## Requirements
### Framework
- python (<= 3.6)
- pytorch (<= 0.3.0)
### Packages
- torchtext
## How to run
```
./getdata
python main.py
```
To run the scripts on a GPU, use `python main.py --gpu-id ID`, where ID is an integer from 0 up to the number of GPUs you have minus one.
## Reference
```
@InProceedings{klerke-goldberg-sogaard:2016:N16-1,
author = {Klerke, Sigrid and Goldberg, Yoav and S{\o}gaard, Anders},
title = {Improving sentence compression by learning to predict gaze},
booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
month = {June},
year = {2016},
address = {San Diego, California},
publisher = {Association for Computational Linguistics},
pages = {1528--1533},
url = {http://www.aclweb.org/anthology/N16-1179}
}
```

View file

@@ -0,0 +1 @@
from .main import *

View file

@@ -0,0 +1,95 @@
from torchtext import data
from const import Phase
def create_dataset(data: dict, batch_size: int, device: int):
train = Dataset(data[Phase.TRAIN]['tokens'],
data[Phase.TRAIN]['labels'],
vocab=None,
batch_size=batch_size,
device=device,
phase=Phase.TRAIN)
dev = Dataset(data[Phase.DEV]['tokens'],
data[Phase.DEV]['labels'],
vocab=train.vocab,
batch_size=batch_size,
device=device,
phase=Phase.DEV)
test = Dataset(data[Phase.TEST]['tokens'],
data[Phase.TEST]['labels'],
vocab=train.vocab,
batch_size=batch_size,
device=device,
phase=Phase.TEST)
return train, dev, test
class Dataset:
def __init__(self,
tokens: list,
label_list: list,
vocab: list,
batch_size: int,
device: int,
phase: Phase):
        assert len(tokens) == len(label_list), \
            'the number of sentences and the number of label sequences must be equal'
self.pad_token = '<PAD>'
# self.unk_token = '<UNK>'
self.tokens = tokens
self.label_list = label_list
self.sentence_id = [[i] for i in range(len(tokens))]
self.device = device
self.token_field = data.Field(use_vocab=True,
# unk_token=self.unk_token,
pad_token=self.pad_token,
batch_first=True)
self.label_field = data.Field(use_vocab=False, pad_token=-1, batch_first=True)
self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)
self.dataset = self._create_dataset()
if vocab is None:
self.token_field.build_vocab(self.tokens)
self.vocab = self.token_field.vocab
else:
self.token_field.vocab = vocab
self.vocab = vocab
self.pad_index = self.token_field.vocab.stoi[self.pad_token]
self._set_batch_iter(batch_size, phase)
def get_raw_sentence(self, sentences):
return [[self.vocab.itos[idx] for idx in sentence]
for sentence in sentences]
def _create_dataset(self):
_fields = [('token', self.token_field),
('label', self.label_field),
('sentence_id', self.sentence_id_field)]
return data.Dataset(self._get_examples(_fields), _fields)
def _get_examples(self, fields: list):
ex = []
for sentence, label, sentence_id in zip(self.tokens, self.label_list, self.sentence_id):
ex.append(data.Example.fromlist([sentence, label, sentence_id], fields))
return ex
def _set_batch_iter(self, batch_size: int, phase: Phase):
        def sort(example: data.Example) -> int:
            # sort key: sentence length in tokens
            return len(getattr(example, 'token'))
        train = phase == Phase.TRAIN
self.batch_iter = data.BucketIterator(dataset=self.dataset,
batch_size=batch_size,
sort_key=sort,
train=train,
repeat=False,
device=self.device)
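
A minimal usage sketch for the dataset helpers above (illustrative only: the toy sentences, labels, and the `device=-1` CPU setting are assumptions; `batch` is the module name used by the training script further below):

```
from const import Phase
from batch import create_dataset

# Toy two-sentence dataset; 0 -> keep, 1 -> delete.
toy = {
    'tokens': [['the', 'cat', 'sat', 'down'], ['a', 'very', 'old', 'dog', 'barked']],
    'labels': [[0, 0, 0, 1], [0, 1, 1, 0, 0]],
}
data = {Phase.TRAIN: toy, Phase.DEV: toy, Phase.TEST: toy}

# device=-1 selects the CPU in older torchtext versions.
train, dev, test = create_dataset(data, batch_size=2, device=-1)
for batch in train.batch_iter:
    print(batch.token.size(), batch.label.size())
```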

View file

@@ -0,0 +1,8 @@
from enum import Enum, unique
@unique
class Phase(Enum):
TRAIN = 'train'
DEV = 'dev'
TEST = 'test'

View file

@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
class Network(nn.Module):
def __init__(self,
embeddings,
hidden_size: int,
prior,
device: torch.device):
super(Network, self).__init__()
self.device = device
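        # log of the two class priors (prior, 1 - prior); added to the logits
        # at inference time only (see forward()) to bias predictions toward
        # the expected label distribution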
self.priors = torch.log(torch.tensor([prior, 1-prior])).to(device)
self.hidden_size = hidden_size
self.bilstm_layers = 3
self.bilstm_input_size = 300
self.bilstm_output_size = 2 * hidden_size
self.word_emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
self.bilstm = nn.LSTM(self.bilstm_input_size,
self.hidden_size,
num_layers=self.bilstm_layers,
batch_first=True,
dropout=0.1, #ms best mod 0.1
bidirectional=True)
self.dropout = nn.Dropout(p=0.35)
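        # Note: self.attention refers to the attention() method defined below,
        # so this check (and the matching one in forward()) is always truthy;
        # the attention layers below are therefore always created.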
if self.attention:
self.attention_size = self.bilstm_output_size * 2
self.u_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
self.w_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
self.v_a_inv = nn.Linear(self.bilstm_output_size, 1, bias=False)
self.linear_attn = nn.Linear(self.attention_size, self.bilstm_output_size)
self.linear = nn.Linear(self.bilstm_output_size, self.hidden_size)
self.pred = nn.Linear(self.hidden_size, 2)
self.softmax = nn.LogSoftmax(dim=1)
self.criterion = nn.NLLLoss(ignore_index=-1)
def forward(self, input_tokens, labels, fixations=None):
loss = 0.0
preds = []
atts = []
batch_size, seq_len = input_tokens.size()
self.init_hidden(batch_size, device=self.device)
x_i = self.word_emb(input_tokens)
x_i = self.dropout(x_i)
hidden, (self.h_n, self.c_n) = self.bilstm(x_i, (self.h_n, self.c_n))
_, _, hidden_size = hidden.size()
for i in range(seq_len):
nth_hidden = hidden[:, i, :]
if self.attention:
target = nth_hidden.expand(seq_len, batch_size, -1).transpose(0, 1)
mask = hidden.eq(target)[:, :, 0].unsqueeze(2)
attn_weight = self.attention(hidden, target, fixations, mask)
context_vector = torch.bmm(attn_weight.transpose(1, 2), hidden).squeeze(1)
nth_hidden = torch.tanh(self.linear_attn(torch.cat((nth_hidden, context_vector), -1)))
atts.append(attn_weight.detach().cpu())
logits = self.pred(self.linear(nth_hidden))
if not self.training:
logits = logits + self.priors
output = self.softmax(logits)
loss += self.criterion(output, labels[:, i])
_, topi = output.topk(k=1, dim=1)
pred = topi.squeeze(-1)
preds.append(pred)
preds = torch.stack(torch.cat(preds, dim=0).split(batch_size), dim=1)
if atts:
atts = torch.stack(torch.cat(atts, dim=0).split(batch_size), dim=1)
return loss, preds, atts
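    # Additive attention: scores g = v_a^T tanh(U_a * source + W_a * target),
    # optionally masked and re-weighted by per-token fixation values, then
    # normalized with a softmax over the sequence dimension.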
def attention(self, source, target, fixations=None, mask=None):
function_g = \
self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
if mask is not None:
function_g.masked_fill_(mask, -1e4)
if fixations is not None:
function_g = function_g*fixations
return nn.functional.softmax(function_g, dim=1)
def init_hidden(self, batch_size, device):
zeros = Variable(torch.zeros(2*self.bilstm_layers, batch_size, self.hidden_size))
self.h_n = zeros.to(device)
self.c_n = zeros.to(device)
return self.h_n, self.c_n
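
A minimal sketch of driving the `Network` class above, assuming it is importable; the embedding matrix, hidden size, prior, and input tensors are placeholders, not values from the repository:

```
import torch
from models import Network  # hypothetical import path; the diff does not show filenames

emb = torch.randn(10000, 300)               # placeholder embeddings (vocab x 300, matching bilstm_input_size)
model = Network(embeddings=emb, hidden_size=200, prior=0.5,
                device=torch.device('cpu'))

tokens = torch.randint(0, 10000, (4, 12))   # batch of 4 sentences, 12 tokens each
labels = torch.randint(0, 2, (4, 12))       # 0 -> keep, 1 -> delete
loss, preds, atts = model(tokens, labels)
print(loss.item(), preds.size())
```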

View file

@@ -0,0 +1,183 @@
import torch
from torch import optim
import tqdm
from const import Phase
from batch import create_dataset
from models import Baseline
from sklearn.metrics import classification_report
def run(dataset_train,
dataset_dev,
dataset_test,
model_type,
word_embed_size,
hidden_size,
batch_size,
device,
n_epochs):
if model_type == 'base':
model = Baseline(vocab=dataset_train.vocab,
word_embed_size=word_embed_size,
hidden_size=hidden_size,
device=device,
inference=False)
else:
raise NotImplementedError
model = model.to(device)
optim_params = model.parameters()
optimizer = optim.Adam(optim_params, lr=10**-3)
print('start training')
for epoch in range(n_epochs):
train_loss, tokens, preds, golds = train(dataset_train,
model,
optimizer,
batch_size,
epoch,
Phase.TRAIN,
device)
dev_loss, tokens, preds, golds = train(dataset_dev,
model,
optimizer,
batch_size,
epoch,
Phase.DEV,
device)
logger = '\t'.join(['epoch {}'.format(epoch+1),
'TRAIN Loss: {:.9f}'.format(train_loss),
'DEV Loss: {:.9f}'.format(dev_loss)])
# print('\r'+logger, end='')
print(logger)
test_loss, tokens, preds, golds = train(dataset_test,
model,
optimizer,
batch_size,
epoch,
Phase.TEST,
device)
print('====', 'TEST', '=====')
print_scores(preds, golds)
output_results(tokens, preds, golds)
def train(dataset,
model,
optimizer,
batch_size,
n_epoch,
phase,
device):
total_loss = 0.0
tokens = []
preds = []
labels = []
if phase == Phase.TRAIN:
model.train()
else:
model.eval()
for batch in tqdm.tqdm(dataset.batch_iter):
token = getattr(batch, 'token')
label = getattr(batch, 'label')
raw_sentences = dataset.get_raw_sentence(token.data.detach().cpu().numpy())
loss, pred = \
model(token, raw_sentences, label, phase)
        if phase == Phase.TRAIN:
            optimizer.zero_grad()
            loss.backward()
            # clip gradients after backward(); before it there is nothing to clip
            torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=5)
            optimizer.step()
# remove PAD from input sentences/labels and results
mask = (token != dataset.pad_index)
length_tensor = mask.sum(1)
length_tensor = length_tensor.data.detach().cpu().numpy()
for index, n_tokens_in_the_sentence in enumerate(length_tensor):
if n_tokens_in_the_sentence > 0:
tokens.append(raw_sentences[index][:n_tokens_in_the_sentence])
_label = label[index][:n_tokens_in_the_sentence]
_pred = pred[index][:n_tokens_in_the_sentence]
_label = _label.data.detach().cpu().numpy()
_pred = _pred.data.detach().cpu().numpy()
labels.append(_label)
preds.append(_pred)
total_loss += torch.mean(loss).item()
return total_loss, tokens, preds, labels
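# read_two_cols_data() below expects one token and one label per line, separated
# by whitespace, with a blank line between sentences, e.g. (illustrative):
#
#   prime      0
#   minister   0
#   said       0
#   yesterday  1
#
# Labels may be 0/1 or textual, in which case 'del' maps to 1 (delete) and
# anything else to 0 (keep).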
def read_two_cols_data(fname, max_len=None):
data = {}
tokens = []
labels = []
token = []
label = []
with open(fname, mode='r') as f:
for line in f:
line = line.strip().lower().split()
if line:
try:
_token, _label = line
except ValueError:
raise
token.append(_token)
if _label == '0' or _label == '1':
label.append(int(_label))
else:
if _label == 'del':
label.append(1)
else:
label.append(0)
else:
if max_len is None or len(token) <= max_len:
tokens.append(token)
labels.append(label)
token = []
label = []
data['tokens'] = tokens
data['labels'] = labels
return data
def load(train_path, dev_path, test_path, batch_size, max_len, device):
train = read_two_cols_data(train_path, max_len)
dev = read_two_cols_data(dev_path)
test = read_two_cols_data(test_path)
data = {Phase.TRAIN: train, Phase.DEV: dev, Phase.TEST: test}
return create_dataset(data, batch_size=batch_size, device=device)
def print_scores(preds, golds):
_preds = [label for sublist in preds for label in sublist]
_golds = [label for sublist in golds for label in sublist]
target_names = ['not_del', 'del']
print(classification_report(_golds, _preds, target_names=target_names, digits=5))
def output_results(tokens, preds, golds, path='./result/sentcomp'):
with open(path+'.original.txt', mode='w') as w, \
open(path+'.gold.txt', mode='w') as w_gold, \
open(path+'.pred.txt', mode='w') as w_pred:
for _tokens, _golds, _preds in zip(tokens, golds, preds):
for token, gold, pred in zip(_tokens, _golds, _preds):
w.write(token + ' ')
if gold == 0:
w_gold.write(token + ' ')
# 0 -> keep, 1 -> delete
if pred == 0:
w_pred.write(token + ' ')
w.write('\n')
w_gold.write('\n')
w_pred.write('\n')
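
The diff does not include an argument-parsing entry point, although the README mentions a `--gpu-id` flag. A hypothetical sketch of wiring `load()` and `run()` together; the paths, hyperparameters, and flag handling below are assumptions:

```
import argparse
import torch

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu-id', type=int, default=-1)  # -1 -> CPU (hypothetical default)
    args = parser.parse_args()
    device = torch.device('cuda', args.gpu_id) if args.gpu_id >= 0 else torch.device('cpu')

    # Hypothetical data paths and hyperparameters.
    train, dev, test = load('data/train.txt', 'data/dev.txt', 'data/test.txt',
                            batch_size=32, max_len=None, device=device)
    run(train, dev, test,
        model_type='base', word_embed_size=300, hidden_size=200,
        batch_size=32, device=device, n_epochs=10)
```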