Add NLP task models
This commit is contained in:
parent
d8beb17dfb
commit
69f6de0ace
46 changed files with 4976 additions and 0 deletions
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2018, Tatsuya Aoki
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,31 @@
# Simple Model for Sentence Compression

3-layered BiLSTM model for sentence compression, referred to as Baseline in [Klerke et al., NAACL 2016](http://aclweb.org/anthology/N/N16/N16-1179.pdf).

## Requirements

### Framework

- python (>= 3.6)
- pytorch (>= 0.4.0)

### Packages

- torchtext

## How to run

```
./getdata
python main.py
```

To run the script on a GPU, use `python main.py --gpu-id ID`, where ID is an integer from 0 up to the number of GPUs you have minus one.
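
A minimal sketch of how a `--gpu-id` flag can be mapped to the `device` value that `batch.py` hands to torchtext; the argument parsing itself is not shown in this commit, so the flag handling below is an assumption:

```
import argparse
import torch

parser = argparse.ArgumentParser()
# -1 keeps everything on the CPU; 0..N-1 selects a GPU
# (the older torchtext iterators used here take an integer device)
parser.add_argument('--gpu-id', type=int, default=-1)
args = parser.parse_args()

if args.gpu_id >= 0 and torch.cuda.is_available():
    torch.cuda.set_device(args.gpu_id)
device = args.gpu_id  # forwarded to load(...) / create_dataset(...) as `device`
```
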
## Reference

```
@InProceedings{klerke-goldberg-sogaard:2016:N16-1,
  author    = {Klerke, Sigrid and Goldberg, Yoav and S{\o}gaard, Anders},
  title     = {Improving sentence compression by learning to predict gaze},
  booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2016},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  pages     = {1528--1533},
  url       = {http://www.aclweb.org/anthology/N16-1179}
}
```
@@ -0,0 +1 @@
from .main import *
@@ -0,0 +1,95 @@
from torchtext import data
from const import Phase


def create_dataset(data: dict, batch_size: int, device: int):

    train = Dataset(data[Phase.TRAIN]['tokens'],
                    data[Phase.TRAIN]['labels'],
                    vocab=None,
                    batch_size=batch_size,
                    device=device,
                    phase=Phase.TRAIN)

    dev = Dataset(data[Phase.DEV]['tokens'],
                  data[Phase.DEV]['labels'],
                  vocab=train.vocab,
                  batch_size=batch_size,
                  device=device,
                  phase=Phase.DEV)

    test = Dataset(data[Phase.TEST]['tokens'],
                   data[Phase.TEST]['labels'],
                   vocab=train.vocab,
                   batch_size=batch_size,
                   device=device,
                   phase=Phase.TEST)
    return train, dev, test


class Dataset:
    def __init__(self,
                 tokens: list,
                 label_list: list,
                 vocab: list,
                 batch_size: int,
                 device: int,
                 phase: Phase):
        assert len(tokens) == len(label_list), \
            'the number of sentences and the number of label sequences ' \
            'should be the same'

        self.pad_token = '<PAD>'
        # self.unk_token = '<UNK>'
        self.tokens = tokens
        self.label_list = label_list
        self.sentence_id = [[i] for i in range(len(tokens))]
        self.device = device

        self.token_field = data.Field(use_vocab=True,
                                      # unk_token=self.unk_token,
                                      pad_token=self.pad_token,
                                      batch_first=True)
        self.label_field = data.Field(use_vocab=False, pad_token=-1, batch_first=True)
        self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)
        self.dataset = self._create_dataset()

        if vocab is None:
            self.token_field.build_vocab(self.tokens)
            self.vocab = self.token_field.vocab
        else:
            self.token_field.vocab = vocab
            self.vocab = vocab
        self.pad_index = self.token_field.vocab.stoi[self.pad_token]

        self._set_batch_iter(batch_size, phase)

    def get_raw_sentence(self, sentences):
        return [[self.vocab.itos[idx] for idx in sentence]
                for sentence in sentences]

    def _create_dataset(self):
        _fields = [('token', self.token_field),
                   ('label', self.label_field),
                   ('sentence_id', self.sentence_id_field)]
        return data.Dataset(self._get_examples(_fields), _fields)

    def _get_examples(self, fields: list):
        ex = []
        for sentence, label, sentence_id in zip(self.tokens, self.label_list, self.sentence_id):
            ex.append(data.Example.fromlist([sentence, label, sentence_id], fields))
        return ex

    def _set_batch_iter(self, batch_size: int, phase: Phase):

        def sort(example: data.Example) -> int:
            # sort key: sentence length, so BucketIterator batches similar lengths together
            return len(example.token)

        train = True if phase == Phase.TRAIN else False

        self.batch_iter = data.BucketIterator(dataset=self.dataset,
                                              batch_size=batch_size,
                                              sort_key=sort,
                                              train=train,
                                              repeat=False,
                                              device=self.device)
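
For reference, a minimal sketch of how `create_dataset` and the resulting `batch_iter` are meant to be consumed; the toy sentences and the CPU device value below are assumptions for illustration, not part of the commit:

```
from const import Phase
from batch import create_dataset

# toy two-sentence corpus; label 1 = delete the token, 0 = keep it
toy = {'tokens': [['the', 'cat', 'sat', 'down'], ['a', 'dog', 'barked']],
       'labels': [[0, 1, 0, 0], [0, 0, 1]]}
data = {Phase.TRAIN: toy, Phase.DEV: toy, Phase.TEST: toy}

train, dev, test = create_dataset(data, batch_size=2, device=-1)  # -1 = CPU in the older torchtext iterator API

for batch in train.batch_iter:
    tokens = batch.token    # LongTensor [batch, seq_len], padded with <PAD>
    labels = batch.label    # LongTensor [batch, seq_len], padded with -1
    words = train.get_raw_sentence(tokens.cpu().numpy())
```
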
@@ -0,0 +1,8 @@
from enum import Enum, unique


@unique
class Phase(Enum):
    TRAIN = 'train'
    DEV = 'dev'
    TEST = 'test'
@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
from torch.autograd import Variable


class Network(nn.Module):
    def __init__(self,
                 embeddings,
                 hidden_size: int,
                 prior,
                 device: torch.device):

        super(Network, self).__init__()
        self.device = device
        self.priors = torch.log(torch.tensor([prior, 1 - prior])).to(device)
        self.hidden_size = hidden_size
        self.bilstm_layers = 3
        self.bilstm_input_size = 300
        self.bilstm_output_size = 2 * hidden_size
        self.word_emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.bilstm = nn.LSTM(self.bilstm_input_size,
                              self.hidden_size,
                              num_layers=self.bilstm_layers,
                              batch_first=True,
                              dropout=0.1,  # ms best mod 0.1
                              bidirectional=True)
        self.dropout = nn.Dropout(p=0.35)
        # `self.attention` resolves to the bound method defined below, so this
        # branch is always taken and the attention parameters are always built.
        if self.attention:
            self.attention_size = self.bilstm_output_size * 2
            self.u_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
            self.w_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
            self.v_a_inv = nn.Linear(self.bilstm_output_size, 1, bias=False)
            self.linear_attn = nn.Linear(self.attention_size, self.bilstm_output_size)
        self.linear = nn.Linear(self.bilstm_output_size, self.hidden_size)
        self.pred = nn.Linear(self.hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=1)
        self.criterion = nn.NLLLoss(ignore_index=-1)

    def forward(self, input_tokens, labels, fixations=None):
        loss = 0.0
        preds = []
        atts = []
        batch_size, seq_len = input_tokens.size()
        self.init_hidden(batch_size, device=self.device)

        x_i = self.word_emb(input_tokens)
        x_i = self.dropout(x_i)

        hidden, (self.h_n, self.c_n) = self.bilstm(x_i, (self.h_n, self.c_n))
        _, _, hidden_size = hidden.size()

        for i in range(seq_len):
            nth_hidden = hidden[:, i, :]
            if self.attention:
                target = nth_hidden.expand(seq_len, batch_size, -1).transpose(0, 1)
                mask = hidden.eq(target)[:, :, 0].unsqueeze(2)
                attn_weight = self.attention(hidden, target, fixations, mask)
                context_vector = torch.bmm(attn_weight.transpose(1, 2), hidden).squeeze(1)

                nth_hidden = torch.tanh(self.linear_attn(torch.cat((nth_hidden, context_vector), -1)))
                atts.append(attn_weight.detach().cpu())
            logits = self.pred(self.linear(nth_hidden))
            if not self.training:
                # add the log class priors at inference time only
                logits = logits + self.priors
            output = self.softmax(logits)
            loss += self.criterion(output, labels[:, i])

            _, topi = output.topk(k=1, dim=1)
            pred = topi.squeeze(-1)
            preds.append(pred)

        preds = torch.stack(torch.cat(preds, dim=0).split(batch_size), dim=1)

        if atts:
            atts = torch.stack(torch.cat(atts, dim=0).split(batch_size), dim=1)

        return loss, preds, atts

    def attention(self, source, target, fixations=None, mask=None):
        function_g = \
            self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
        if mask is not None:
            function_g.masked_fill_(mask, -1e4)
        if fixations is not None:
            function_g = function_g * fixations
        return nn.functional.softmax(function_g, dim=1)

    def init_hidden(self, batch_size, device):
        zeros = Variable(torch.zeros(2 * self.bilstm_layers, batch_size, self.hidden_size))
        self.h_n = zeros.to(device)
        self.c_n = zeros.to(device)
        return self.h_n, self.c_n
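
A small smoke-test sketch for the `Network` module above; the module name, tensor sizes, and random inputs are assumptions for illustration (the commit wires the real model up elsewhere):

```
import torch
from model import Network  # assumed module name for the file above

vocab_size, hidden = 100, 128
embeddings = torch.randn(vocab_size, 300)      # must match bilstm_input_size = 300
net = Network(embeddings, hidden_size=hidden, prior=0.5, device=torch.device('cpu'))

tokens = torch.randint(0, vocab_size, (4, 7))  # [batch=4, seq_len=7]
labels = torch.randint(0, 2, (4, 7))           # per-token 0/1 targets (-1 would be ignored by NLLLoss)
loss, preds, atts = net(tokens, labels)        # preds: [4, 7]; atts: per-step attention weights
loss.backward()
```
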
@@ -0,0 +1,183 @@
import torch
from torch import optim
import tqdm

from const import Phase
from batch import create_dataset
from models import Baseline
from sklearn.metrics import classification_report


def run(dataset_train,
        dataset_dev,
        dataset_test,
        model_type,
        word_embed_size,
        hidden_size,
        batch_size,
        device,
        n_epochs):

    if model_type == 'base':
        model = Baseline(vocab=dataset_train.vocab,
                         word_embed_size=word_embed_size,
                         hidden_size=hidden_size,
                         device=device,
                         inference=False)
    else:
        raise NotImplementedError
    model = model.to(device)

    optim_params = model.parameters()
    optimizer = optim.Adam(optim_params, lr=10**-3)

    print('start training')
    for epoch in range(n_epochs):
        train_loss, tokens, preds, golds = train(dataset_train,
                                                 model,
                                                 optimizer,
                                                 batch_size,
                                                 epoch,
                                                 Phase.TRAIN,
                                                 device)

        dev_loss, tokens, preds, golds = train(dataset_dev,
                                               model,
                                               optimizer,
                                               batch_size,
                                               epoch,
                                               Phase.DEV,
                                               device)
        logger = '\t'.join(['epoch {}'.format(epoch + 1),
                            'TRAIN Loss: {:.9f}'.format(train_loss),
                            'DEV Loss: {:.9f}'.format(dev_loss)])
        # print('\r' + logger, end='')
        print(logger)
    test_loss, tokens, preds, golds = train(dataset_test,
                                            model,
                                            optimizer,
                                            batch_size,
                                            epoch,
                                            Phase.TEST,
                                            device)
    print('=====', 'TEST', '=====')
    print_scores(preds, golds)
    output_results(tokens, preds, golds)


def train(dataset,
          model,
          optimizer,
          batch_size,
          n_epoch,
          phase,
          device):

    total_loss = 0.0
    tokens = []
    preds = []
    labels = []
    if phase == Phase.TRAIN:
        model.train()
    else:
        model.eval()

    for batch in tqdm.tqdm(dataset.batch_iter):
        token = getattr(batch, 'token')
        label = getattr(batch, 'label')
        raw_sentences = dataset.get_raw_sentence(token.data.detach().cpu().numpy())

        loss, pred = \
            model(token, raw_sentences, label, phase)

        if phase == Phase.TRAIN:
            optimizer.zero_grad()
            loss.backward()
            # clip after backward() so the freshly computed gradients are the ones clipped
            torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=5)
            optimizer.step()

        # remove PAD from input sentences/labels and results
        mask = (token != dataset.pad_index)
        length_tensor = mask.sum(1)
        length_tensor = length_tensor.data.detach().cpu().numpy()

        for index, n_tokens_in_the_sentence in enumerate(length_tensor):
            if n_tokens_in_the_sentence > 0:
                tokens.append(raw_sentences[index][:n_tokens_in_the_sentence])
                _label = label[index][:n_tokens_in_the_sentence]
                _pred = pred[index][:n_tokens_in_the_sentence]
                _label = _label.data.detach().cpu().numpy()
                _pred = _pred.data.detach().cpu().numpy()
                labels.append(_label)
                preds.append(_pred)

        total_loss += torch.mean(loss).item()

    return total_loss, tokens, preds, labels


def read_two_cols_data(fname, max_len=None):
    data = {}
    tokens = []
    labels = []
    token = []
    label = []
    with open(fname, mode='r') as f:
        for line in f:
            line = line.strip().lower().split()
            if line:
                _token, _label = line
                token.append(_token)
                if _label == '0' or _label == '1':
                    label.append(int(_label))
                else:
                    if _label == 'del':
                        label.append(1)
                    else:
                        label.append(0)
            else:
                if max_len is None or len(token) <= max_len:
                    tokens.append(token)
                    labels.append(label)
                token = []
                label = []

    data['tokens'] = tokens
    data['labels'] = labels
    return data


def load(train_path, dev_path, test_path, batch_size, max_len, device):
    train = read_two_cols_data(train_path, max_len)
    dev = read_two_cols_data(dev_path)
    test = read_two_cols_data(test_path)
    data = {Phase.TRAIN: train, Phase.DEV: dev, Phase.TEST: test}
    return create_dataset(data, batch_size=batch_size, device=device)


def print_scores(preds, golds):
    _preds = [label for sublist in preds for label in sublist]
    _golds = [label for sublist in golds for label in sublist]
    target_names = ['not_del', 'del']
    print(classification_report(_golds, _preds, target_names=target_names, digits=5))


def output_results(tokens, preds, golds, path='./result/sentcomp'):
    with open(path + '.original.txt', mode='w') as w, \
            open(path + '.gold.txt', mode='w') as w_gold, \
            open(path + '.pred.txt', mode='w') as w_pred:

        for _tokens, _golds, _preds in zip(tokens, golds, preds):
            for token, gold, pred in zip(_tokens, _golds, _preds):
                w.write(token + ' ')
                if gold == 0:
                    w_gold.write(token + ' ')
                # 0 -> keep, 1 -> delete
                if pred == 0:
                    w_pred.write(token + ' ')
            w.write('\n')
            w_gold.write('\n')
            w_pred.write('\n')
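
To tie the pieces together, a hedged sketch of driving `load` and `run` directly; the file paths, batch size, and embedding size are placeholders, and `./getdata` is expected to produce two-column files in the format parsed by `read_two_cols_data`:

```
import torch
from main import load, run  # assumed module name for the file above

# Each data file has two whitespace-separated columns (token, label), one token
# per line, with a blank line between sentences, e.g.:
#   prime     0
#   minister  del
#   said      0
# read_two_cols_data maps 'del' (or '1') to 1 = delete and everything else to 0 = keep.

device = torch.device('cpu')  # newer torchtext accepts a torch.device; older versions expect an int (-1 for CPU)
train, dev, test = load('data/train.txt', 'data/dev.txt', 'data/test.txt',
                        batch_size=32, max_len=70, device=device)
run(train, dev, test,
    model_type='base', word_embed_size=300, hidden_size=128,
    batch_size=32, device=device, n_epochs=10)
```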