Add NLP task models

Ekta Sood 2020-12-08 21:10:52 +01:00
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions

View file

@@ -0,0 +1,29 @@
BSD 3-Clause License
Copyright (c) 2018, Tatsuya Aoki
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@@ -0,0 +1,31 @@
# Simple Model for Sentence Compression
A 3-layer BiLSTM model for sentence compression, referred to as the Baseline in [Klerke et al., NAACL 2016](http://aclweb.org/anthology/N/N16/N16-1179.pdf).
## Requirements
### Framework
- python (<= 3.6)
- pytorch (<= 0.3.0)
### Packages
- torchtext
## How to run
```
./getdata
python main.py
```
To run the scripts on a GPU, use `python main.py --gpu-id ID`, where ID is an integer from 0 up to the number of GPUs you have minus one.
## Reference
```
@InProceedings{klerke-goldberg-sogaard:2016:N16-1,
author = {Klerke, Sigrid and Goldberg, Yoav and S{\o}gaard, Anders},
title = {Improving sentence compression by learning to predict gaze},
booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
month = {June},
year = {2016},
address = {San Diego, California},
publisher = {Association for Computational Linguistics},
pages = {1528--1533},
url = {http://www.aclweb.org/anthology/N16-1179}
}
```

View file

@@ -0,0 +1 @@
from .main import *

View file

@@ -0,0 +1,95 @@
from torchtext import data
from const import Phase
def create_dataset(data: dict, batch_size: int, device: int):
train = Dataset(data[Phase.TRAIN]['tokens'],
data[Phase.TRAIN]['labels'],
vocab=None,
batch_size=batch_size,
device=device,
phase=Phase.TRAIN)
dev = Dataset(data[Phase.DEV]['tokens'],
data[Phase.DEV]['labels'],
vocab=train.vocab,
batch_size=batch_size,
device=device,
phase=Phase.DEV)
test = Dataset(data[Phase.TEST]['tokens'],
data[Phase.TEST]['labels'],
vocab=train.vocab,
batch_size=batch_size,
device=device,
phase=Phase.TEST)
return train, dev, test
class Dataset:
def __init__(self,
tokens: list,
label_list: list,
vocab: list,
batch_size: int,
device: int,
phase: Phase):
        assert len(tokens) == len(label_list), \
            'the number of sentences and the number of label sequences must be equal'
self.pad_token = '<PAD>'
# self.unk_token = '<UNK>'
self.tokens = tokens
self.label_list = label_list
self.sentence_id = [[i] for i in range(len(tokens))]
self.device = device
self.token_field = data.Field(use_vocab=True,
# unk_token=self.unk_token,
pad_token=self.pad_token,
batch_first=True)
self.label_field = data.Field(use_vocab=False, pad_token=-1, batch_first=True)
self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)
self.dataset = self._create_dataset()
if vocab is None:
self.token_field.build_vocab(self.tokens)
self.vocab = self.token_field.vocab
else:
self.token_field.vocab = vocab
self.vocab = vocab
self.pad_index = self.token_field.vocab.stoi[self.pad_token]
self._set_batch_iter(batch_size, phase)
def get_raw_sentence(self, sentences):
return [[self.vocab.itos[idx] for idx in sentence]
for sentence in sentences]
def _create_dataset(self):
_fields = [('token', self.token_field),
('label', self.label_field),
('sentence_id', self.sentence_id_field)]
return data.Dataset(self._get_examples(_fields), _fields)
def _get_examples(self, fields: list):
ex = []
for sentence, label, sentence_id in zip(self.tokens, self.label_list, self.sentence_id):
ex.append(data.Example.fromlist([sentence, label, sentence_id], fields))
return ex
def _set_batch_iter(self, batch_size: int, phase: Phase):
        def sort(example: data.Example) -> int:
            # sort key: sentence length in tokens
            return len(getattr(example, 'token'))
        train = phase == Phase.TRAIN
self.batch_iter = data.BucketIterator(dataset=self.dataset,
batch_size=batch_size,
sort_key=sort,
train=train,
repeat=False,
device=self.device)
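
A minimal usage sketch for the dataset helpers above (illustrative only: the toy sentences, labels, and the `device=-1` CPU setting are assumptions; `batch` is the module name used by the training script further below):

```
from const import Phase
from batch import create_dataset

# Toy two-sentence dataset; 0 -> keep, 1 -> delete.
toy = {
    'tokens': [['the', 'cat', 'sat', 'down'], ['a', 'very', 'old', 'dog', 'barked']],
    'labels': [[0, 0, 0, 1], [0, 1, 1, 0, 0]],
}
data = {Phase.TRAIN: toy, Phase.DEV: toy, Phase.TEST: toy}

# device=-1 selects the CPU in older torchtext versions.
train, dev, test = create_dataset(data, batch_size=2, device=-1)
for batch in train.batch_iter:
    print(batch.token.size(), batch.label.size())
```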

View file

@@ -0,0 +1,8 @@
from enum import Enum, unique
@unique
class Phase(Enum):
TRAIN = 'train'
DEV = 'dev'
TEST = 'test'

View file

@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
class Network(nn.Module):
def __init__(self,
embeddings,
hidden_size: int,
prior,
device: torch.device):
super(Network, self).__init__()
self.device = device
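        # log of the two class priors (prior, 1 - prior); added to the logits
        # at inference time only (see forward()) to bias predictions toward
        # the expected label distribution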
self.priors = torch.log(torch.tensor([prior, 1-prior])).to(device)
self.hidden_size = hidden_size
self.bilstm_layers = 3
self.bilstm_input_size = 300
self.bilstm_output_size = 2 * hidden_size
self.word_emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
self.bilstm = nn.LSTM(self.bilstm_input_size,
self.hidden_size,
num_layers=self.bilstm_layers,
batch_first=True,
dropout=0.1, #ms best mod 0.1
bidirectional=True)
self.dropout = nn.Dropout(p=0.35)
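        # Note: self.attention refers to the attention() method defined below,
        # so this check (and the matching one in forward()) is always truthy;
        # the attention layers below are therefore always created.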
if self.attention:
self.attention_size = self.bilstm_output_size * 2
self.u_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
self.w_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
self.v_a_inv = nn.Linear(self.bilstm_output_size, 1, bias=False)
self.linear_attn = nn.Linear(self.attention_size, self.bilstm_output_size)
self.linear = nn.Linear(self.bilstm_output_size, self.hidden_size)
self.pred = nn.Linear(self.hidden_size, 2)
self.softmax = nn.LogSoftmax(dim=1)
self.criterion = nn.NLLLoss(ignore_index=-1)
def forward(self, input_tokens, labels, fixations=None):
loss = 0.0
preds = []
atts = []
batch_size, seq_len = input_tokens.size()
self.init_hidden(batch_size, device=self.device)
x_i = self.word_emb(input_tokens)
x_i = self.dropout(x_i)
hidden, (self.h_n, self.c_n) = self.bilstm(x_i, (self.h_n, self.c_n))
_, _, hidden_size = hidden.size()
for i in range(seq_len):
nth_hidden = hidden[:, i, :]
if self.attention:
target = nth_hidden.expand(seq_len, batch_size, -1).transpose(0, 1)
mask = hidden.eq(target)[:, :, 0].unsqueeze(2)
attn_weight = self.attention(hidden, target, fixations, mask)
context_vector = torch.bmm(attn_weight.transpose(1, 2), hidden).squeeze(1)
nth_hidden = torch.tanh(self.linear_attn(torch.cat((nth_hidden, context_vector), -1)))
atts.append(attn_weight.detach().cpu())
logits = self.pred(self.linear(nth_hidden))
if not self.training:
logits = logits + self.priors
output = self.softmax(logits)
loss += self.criterion(output, labels[:, i])
_, topi = output.topk(k=1, dim=1)
pred = topi.squeeze(-1)
preds.append(pred)
preds = torch.stack(torch.cat(preds, dim=0).split(batch_size), dim=1)
if atts:
atts = torch.stack(torch.cat(atts, dim=0).split(batch_size), dim=1)
return loss, preds, atts
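    # Additive attention: scores g = v_a^T tanh(U_a * source + W_a * target),
    # optionally masked and re-weighted by per-token fixation values, then
    # normalized with a softmax over the sequence dimension.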
def attention(self, source, target, fixations=None, mask=None):
function_g = \
self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
if mask is not None:
function_g.masked_fill_(mask, -1e4)
if fixations is not None:
function_g = function_g*fixations
return nn.functional.softmax(function_g, dim=1)
def init_hidden(self, batch_size, device):
zeros = Variable(torch.zeros(2*self.bilstm_layers, batch_size, self.hidden_size))
self.h_n = zeros.to(device)
self.c_n = zeros.to(device)
return self.h_n, self.c_n
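
A minimal sketch of driving the `Network` class above, assuming it is importable; the embedding matrix, hidden size, prior, and input tensors are placeholders, not values from the repository:

```
import torch
from models import Network  # hypothetical import path; the diff does not show filenames

emb = torch.randn(10000, 300)               # placeholder embeddings (vocab x 300, matching bilstm_input_size)
model = Network(embeddings=emb, hidden_size=200, prior=0.5,
                device=torch.device('cpu'))

tokens = torch.randint(0, 10000, (4, 12))   # batch of 4 sentences, 12 tokens each
labels = torch.randint(0, 2, (4, 12))       # 0 -> keep, 1 -> delete
loss, preds, atts = model(tokens, labels)
print(loss.item(), preds.size())
```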

View file

@@ -0,0 +1,183 @@
import torch
from torch import optim
import tqdm
from const import Phase
from batch import create_dataset
from models import Baseline
from sklearn.metrics import classification_report
def run(dataset_train,
dataset_dev,
dataset_test,
model_type,
word_embed_size,
hidden_size,
batch_size,
device,
n_epochs):
if model_type == 'base':
model = Baseline(vocab=dataset_train.vocab,
word_embed_size=word_embed_size,
hidden_size=hidden_size,
device=device,
inference=False)
else:
raise NotImplementedError
model = model.to(device)
optim_params = model.parameters()
optimizer = optim.Adam(optim_params, lr=10**-3)
print('start training')
for epoch in range(n_epochs):
train_loss, tokens, preds, golds = train(dataset_train,
model,
optimizer,
batch_size,
epoch,
Phase.TRAIN,
device)
dev_loss, tokens, preds, golds = train(dataset_dev,
model,
optimizer,
batch_size,
epoch,
Phase.DEV,
device)
logger = '\t'.join(['epoch {}'.format(epoch+1),
'TRAIN Loss: {:.9f}'.format(train_loss),
'DEV Loss: {:.9f}'.format(dev_loss)])
# print('\r'+logger, end='')
print(logger)
test_loss, tokens, preds, golds = train(dataset_test,
model,
optimizer,
batch_size,
epoch,
Phase.TEST,
device)
print('====', 'TEST', '=====')
print_scores(preds, golds)
output_results(tokens, preds, golds)
def train(dataset,
model,
optimizer,
batch_size,
n_epoch,
phase,
device):
total_loss = 0.0
tokens = []
preds = []
labels = []
if phase == Phase.TRAIN:
model.train()
else:
model.eval()
for batch in tqdm.tqdm(dataset.batch_iter):
token = getattr(batch, 'token')
label = getattr(batch, 'label')
raw_sentences = dataset.get_raw_sentence(token.data.detach().cpu().numpy())
loss, pred = \
model(token, raw_sentences, label, phase)
        if phase == Phase.TRAIN:
            optimizer.zero_grad()
            loss.backward()
            # clip gradients after backward(); before it there is nothing to clip
            torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=5)
            optimizer.step()
# remove PAD from input sentences/labels and results
mask = (token != dataset.pad_index)
length_tensor = mask.sum(1)
length_tensor = length_tensor.data.detach().cpu().numpy()
for index, n_tokens_in_the_sentence in enumerate(length_tensor):
if n_tokens_in_the_sentence > 0:
tokens.append(raw_sentences[index][:n_tokens_in_the_sentence])
_label = label[index][:n_tokens_in_the_sentence]
_pred = pred[index][:n_tokens_in_the_sentence]
_label = _label.data.detach().cpu().numpy()
_pred = _pred.data.detach().cpu().numpy()
labels.append(_label)
preds.append(_pred)
total_loss += torch.mean(loss).item()
return total_loss, tokens, preds, labels
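# read_two_cols_data() below expects one token and one label per line, separated
# by whitespace, with a blank line between sentences, e.g. (illustrative):
#
#   prime      0
#   minister   0
#   said       0
#   yesterday  1
#
# Labels may be 0/1 or textual, in which case 'del' maps to 1 (delete) and
# anything else to 0 (keep).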
def read_two_cols_data(fname, max_len=None):
data = {}
tokens = []
labels = []
token = []
label = []
with open(fname, mode='r') as f:
for line in f:
line = line.strip().lower().split()
if line:
try:
_token, _label = line
except ValueError:
raise
token.append(_token)
if _label == '0' or _label == '1':
label.append(int(_label))
else:
if _label == 'del':
label.append(1)
else:
label.append(0)
else:
if max_len is None or len(token) <= max_len:
tokens.append(token)
labels.append(label)
token = []
label = []
data['tokens'] = tokens
data['labels'] = labels
return data
def load(train_path, dev_path, test_path, batch_size, max_len, device):
train = read_two_cols_data(train_path, max_len)
dev = read_two_cols_data(dev_path)
test = read_two_cols_data(test_path)
data = {Phase.TRAIN: train, Phase.DEV: dev, Phase.TEST: test}
return create_dataset(data, batch_size=batch_size, device=device)
def print_scores(preds, golds):
_preds = [label for sublist in preds for label in sublist]
_golds = [label for sublist in golds for label in sublist]
target_names = ['not_del', 'del']
print(classification_report(_golds, _preds, target_names=target_names, digits=5))
def output_results(tokens, preds, golds, path='./result/sentcomp'):
with open(path+'.original.txt', mode='w') as w, \
open(path+'.gold.txt', mode='w') as w_gold, \
open(path+'.pred.txt', mode='w') as w_pred:
for _tokens, _golds, _preds in zip(tokens, golds, preds):
for token, gold, pred in zip(_tokens, _golds, _preds):
w.write(token + ' ')
if gold == 0:
w_gold.write(token + ' ')
# 0 -> keep, 1 -> delete
if pred == 0:
w_pred.write(token + ' ')
w.write('\n')
w_gold.write('\n')
w_pred.write('\n')
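
The diff does not include an argument-parsing entry point, although the README mentions a `--gpu-id` flag. A hypothetical sketch of wiring `load()` and `run()` together; the paths, hyperparameters, and flag handling below are assumptions:

```
import argparse
import torch

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu-id', type=int, default=-1)  # -1 -> CPU (hypothetical default)
    args = parser.parse_args()
    device = torch.device('cuda', args.gpu_id) if args.gpu_id >= 0 else torch.device('cpu')

    # Hypothetical data paths and hyperparameters.
    train, dev, test = load('data/train.txt', 'data/dev.txt', 'data/test.txt',
                            batch_size=32, max_len=None, device=device)
    run(train, dev, test,
        model_type='base', word_embed_size=300, hidden_size=200,
        batch_size=32, device=device, n_epochs=10)
```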