Add NLP task models
parent: d8beb17dfb
commit: 69f6de0ace
46 changed files with 4976 additions and 0 deletions
joint_sentence_compression_model/libs/__init__.py (new file, 0 lines)
joint_sentence_compression_model/libs/corpora.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import logging

import config


def tokenize(sent):
    return sent.split(" ")


class Lang:
    """Represents the vocabulary."""

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
        }
        self.n_words = 2

    def add_sentence(self, sentence):
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count.keys():
            new_lang.add_word(word)
        for word in other.word2count.keys():
            new_lang.add_word(word)

        # Fix the counts on the new one
        for word in new_lang.word2count.keys():
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang


def load_google(split, max_len=None):
    """Load the Google Sentence Compression Dataset"""
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.google_train_path
    elif split == "val":
        path = config.google_dev_path
    elif split == "test":
        path = config.google_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s" % (split, path))

    data = []
    sent = []
    mask = []
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if line:
                w, d = line.split("\t")
                sent.append(w)
                mask.append(int(d))
            else:
                if sent and (max_len is None or len(sent) <= max_len):
                    data.append([sent, mask])
                    lang.add_sentence(sent)
                sent = []
                mask = []
    if sent:
        data.append([tuple(sent), tuple(mask)])
        lang.add_sentence(sent)

    return data, lang
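For reference, `load_google` expects the dataset as blank-line-separated sentences with one `token<TAB>deletion-flag` pair per line, and `Lang` reserves index 0 for the padding token and index 1 for the unknown token. A minimal usage sketch follows; the import path and the `config` constants it relies on are assumptions, not part of this hunk:

```python
# Hypothetical driver for corpora.py (paths and import layout assumed).
#
# Expected file contents at config.google_train_path, one token per line:
#   Overall<TAB>0
#   the<TAB>1
#   plan<TAB>0
#   <blank line between sentences>
from libs import corpora  # assumed package layout

data, lang = corpora.load_google("train", max_len=50)
sent, mask = data[0]                              # tokenized sentence and its keep/delete mask
ids = [lang.word2index.get(w, 1) for w in sent]   # index 1 is reserved for the UNK token
```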
@@ -0,0 +1 @@
from .main import *
@@ -0,0 +1,125 @@
from collections import OrderedDict
import logging
import sys

from .self_attention import Transformer

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence, pad_sequence


def random_embedding(vocab_size, embedding_dim):
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)

    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)

    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    score = torch.sigmoid(outputs)

    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1

    score = score * mask

    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )

    loss = loss / batch_size

    return loss, score.view(batch_size, seq_len)


class Network(nn.Module):
    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        self.attention = attention
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings, freeze=True)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if self.attention:
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)

        output = []
        for _x, l in zip(x.transpose(1, 0), word_seq_length):
            output.append(self.post(_x[:l].unsqueeze(0))[0])

        return pad_sequence(output, batch_first=True)


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(x, word_seq_length, True, False)
        lstm_out, hidden = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
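A shape sketch for the model above, assuming the module is importable (the hunk does not show its file name) and using arbitrary sizes: with `embedding_type="glove"` and no pretrained matrix, the prelayers embed token indices, the BiLSTM returns `hidden_dim`-wide states, and the postlayers score each token.

```python
# Hypothetical usage; sizes are illustrative, import path assumed.
import torch

net = Network(
    embedding_type="glove",  # enables the nn.Embedding + dropout prelayers
    vocab_size=10000,
    embedding_dim=300,
    dropout=0.1,
    hidden_dim=256,          # BiLSTM output width and Transformer d_model
)
tokens = torch.randint(0, 10000, (4, 20))   # [batch, seq_len] word indices
lengths = torch.tensor([20, 18, 15, 9])     # true lengths per sentence
scores = net(tokens, lengths)               # [batch, max_len, 1] per-token scores
```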
@@ -0,0 +1,128 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()


class AttentionLayer(nn.Module):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def forward(self, Q, K, V):
        # Q: float32[batch_size, n_queries, d_k]
        # K: float32[batch_size, n_keys, d_k]
        # V: float32[batch_size, n_keys, d_v]
        dk = K.shape[-1]
        dv = V.shape[-1]
        KT = torch.transpose(K, -1, -2)
        weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
        # weight_logits: float32[batch_size, n_queries, n_keys]
        weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
        return torch.bmm(weights, V)
        # returns float32[batch_size, n_queries, d_v]


class MultiHeadedSelfAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        print('{} {}'.format(d_model, n_heads))
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q: float32[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K: float32[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V: float32[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # head_output: float32[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated: float32[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out: float32[batch_size, sequence_length, self.d_model]
        return out


class Feedforward(nn.Module):
    def __init__(self, d_model):
        super(Feedforward, self).__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, d_model]
        return self.W2(torch.relu(self.W1(x)))


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, n_layers):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList([
            MultiHeadedSelfAttentionLayer(d_model, n_heads)
            for _ in range(n_layers)
        ])
        self.ffs = nn.ModuleList([
            Feedforward(d_model)
            for _ in range(n_layers)
        ])

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        for attention_layer, ff in zip(self.attention_layers, self.ffs):
            attention_out = attention_layer(x)
            # attention_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + attention_out, x.shape[2:])
            ff_out = ff(x)
            # ff_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + ff_out, x.shape[2:])
        return x
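A quick shape check for the residual self-attention block above; the only hard constraint in the code is that `d_model` must be divisible by `n_heads`.

```python
# Hypothetical check; import path assumed, sizes arbitrary.
import torch

block = Transformer(d_model=256, n_heads=4, n_layers=1)
x = torch.randn(8, 30, 256)   # [batch, seq_len, d_model]
y = block(x)                  # self-attention + feedforward, each with residual layer norm
assert y.shape == x.shape
```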
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2018, Tatsuya Aoki
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,31 @@
# Simple Model for Sentence Compression

3-layered BiLSTM model for sentence compression, referred to as the Baseline in [Klerke et al., NAACL 2016](http://aclweb.org/anthology/N/N16/N16-1179.pdf).

## Requirements

### Framework

- python (<= 3.6)
- pytorch (<= 0.3.0)

### Packages

- torchtext

## How to run

```
./getdata
python main.py
```

To run the scripts on a GPU, use `python main.py --gpu-id ID`, where ID is an integer from 0 up to the number of GPUs you have.

## Reference

```
@InProceedings{klerke-goldberg-sogaard:2016:N16-1,
  author    = {Klerke, Sigrid and Goldberg, Yoav and S{\o}gaard, Anders},
  title     = {Improving sentence compression by learning to predict gaze},
  booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2016},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  pages     = {1528--1533},
  url       = {http://www.aclweb.org/anthology/N16-1179}
}
```
@@ -0,0 +1 @@
from .main import *
@@ -0,0 +1,95 @@
from torchtext import data
from const import Phase


def create_dataset(data: dict, batch_size: int, device: int):

    train = Dataset(data[Phase.TRAIN]['tokens'],
                    data[Phase.TRAIN]['labels'],
                    vocab=None,
                    batch_size=batch_size,
                    device=device,
                    phase=Phase.TRAIN)

    dev = Dataset(data[Phase.DEV]['tokens'],
                  data[Phase.DEV]['labels'],
                  vocab=train.vocab,
                  batch_size=batch_size,
                  device=device,
                  phase=Phase.DEV)

    test = Dataset(data[Phase.TEST]['tokens'],
                   data[Phase.TEST]['labels'],
                   vocab=train.vocab,
                   batch_size=batch_size,
                   device=device,
                   phase=Phase.TEST)
    return train, dev, test


class Dataset:
    def __init__(self,
                 tokens: list,
                 label_list: list,
                 vocab: list,
                 batch_size: int,
                 device: int,
                 phase: Phase):
        assert len(tokens) == len(label_list), \
            'the number of sentences and the number of POS/head sequences \
             should be the same length'

        self.pad_token = '<PAD>'
        # self.unk_token = '<UNK>'
        self.tokens = tokens
        self.label_list = label_list
        self.sentence_id = [[i] for i in range(len(tokens))]
        self.device = device

        self.token_field = data.Field(use_vocab=True,
                                      # unk_token=self.unk_token,
                                      pad_token=self.pad_token,
                                      batch_first=True)
        self.label_field = data.Field(use_vocab=False, pad_token=-1, batch_first=True)
        self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)
        self.dataset = self._create_dataset()

        if vocab is None:
            self.token_field.build_vocab(self.tokens)
            self.vocab = self.token_field.vocab
        else:
            self.token_field.vocab = vocab
            self.vocab = vocab
        self.pad_index = self.token_field.vocab.stoi[self.pad_token]

        self._set_batch_iter(batch_size, phase)

    def get_raw_sentence(self, sentences):
        return [[self.vocab.itos[idx] for idx in sentence]
                for sentence in sentences]

    def _create_dataset(self):
        _fields = [('token', self.token_field),
                   ('label', self.label_field),
                   ('sentence_id', self.sentence_id_field)]
        return data.Dataset(self._get_examples(_fields), _fields)

    def _get_examples(self, fields: list):
        ex = []
        for sentence, label, sentence_id in zip(self.tokens, self.label_list, self.sentence_id):
            ex.append(data.Example.fromlist([sentence, label, sentence_id], fields))
        return ex

    def _set_batch_iter(self, batch_size: int, phase: Phase):

        def sort(data: data.Dataset) -> int:
            return len(getattr(data, 'token'))

        train = True if phase == Phase.TRAIN else False

        self.batch_iter = data.BucketIterator(dataset=self.dataset,
                                              batch_size=batch_size,
                                              sort_key=sort,
                                              train=train,
                                              repeat=False,
                                              device=self.device)
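`create_dataset` expects a dict keyed by `Phase`, where each value holds parallel `tokens` and `labels` lists; this is the shape produced by `read_two_cols_data` in the training script further down. A toy sketch follows, with placeholder values and the old-torchtext CPU device convention assumed:

```python
# Hypothetical toy input; real runs build this dict via read_two_cols_data/load.
toy = {
    Phase.TRAIN: {'tokens': [['the', 'big', 'cat']], 'labels': [[1, 0, 0]]},
    Phase.DEV:   {'tokens': [['a', 'dog']],          'labels': [[0, 0]]},
    Phase.TEST:  {'tokens': [['hello']],             'labels': [[0]]},
}
train, dev, test = create_dataset(toy, batch_size=1, device=-1)  # -1 = CPU in older torchtext
for batch in train.batch_iter:
    print(batch.token, batch.label)
```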
@@ -0,0 +1,8 @@
from enum import Enum, unique


@unique
class Phase(Enum):
    TRAIN = 'train'
    DEV = 'dev'
    TEST = 'test'
@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
from torch.autograd import Variable


class Network(nn.Module):
    def __init__(self,
                 embeddings,
                 hidden_size: int,
                 prior,
                 device: torch.device):

        super(Network, self).__init__()
        self.device = device
        self.priors = torch.log(torch.tensor([prior, 1-prior])).to(device)
        self.hidden_size = hidden_size
        self.bilstm_layers = 3
        self.bilstm_input_size = 300
        self.bilstm_output_size = 2 * hidden_size
        self.word_emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.bilstm = nn.LSTM(self.bilstm_input_size,
                              self.hidden_size,
                              num_layers=self.bilstm_layers,
                              batch_first=True,
                              dropout=0.1,  # ms best mod 0.1
                              bidirectional=True)
        self.dropout = nn.Dropout(p=0.35)
        if self.attention:
            self.attention_size = self.bilstm_output_size * 2
            self.u_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
            self.w_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
            self.v_a_inv = nn.Linear(self.bilstm_output_size, 1, bias=False)
            self.linear_attn = nn.Linear(self.attention_size, self.bilstm_output_size)
        self.linear = nn.Linear(self.bilstm_output_size, self.hidden_size)
        self.pred = nn.Linear(self.hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=1)
        self.criterion = nn.NLLLoss(ignore_index=-1)

    def forward(self, input_tokens, labels, fixations=None):
        loss = 0.0
        preds = []
        atts = []
        batch_size, seq_len = input_tokens.size()
        self.init_hidden(batch_size, device=self.device)

        x_i = self.word_emb(input_tokens)
        x_i = self.dropout(x_i)

        hidden, (self.h_n, self.c_n) = self.bilstm(x_i, (self.h_n, self.c_n))
        _, _, hidden_size = hidden.size()

        for i in range(seq_len):
            nth_hidden = hidden[:, i, :]
            if self.attention:
                target = nth_hidden.expand(seq_len, batch_size, -1).transpose(0, 1)
                mask = hidden.eq(target)[:, :, 0].unsqueeze(2)
                attn_weight = self.attention(hidden, target, fixations, mask)
                context_vector = torch.bmm(attn_weight.transpose(1, 2), hidden).squeeze(1)

                nth_hidden = torch.tanh(self.linear_attn(torch.cat((nth_hidden, context_vector), -1)))
                atts.append(attn_weight.detach().cpu())
            logits = self.pred(self.linear(nth_hidden))
            if not self.training:
                logits = logits + self.priors
            output = self.softmax(logits)
            loss += self.criterion(output, labels[:, i])

            _, topi = output.topk(k=1, dim=1)
            pred = topi.squeeze(-1)
            preds.append(pred)

        preds = torch.stack(torch.cat(preds, dim=0).split(batch_size), dim=1)

        if atts:
            atts = torch.stack(torch.cat(atts, dim=0).split(batch_size), dim=1)

        return loss, preds, atts

    def attention(self, source, target, fixations=None, mask=None):
        function_g = \
            self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
        if mask is not None:
            function_g.masked_fill_(mask, -1e4)
        if fixations is not None:
            function_g = function_g * fixations
        return nn.functional.softmax(function_g, dim=1)

    def init_hidden(self, batch_size, device):
        zeros = Variable(torch.zeros(2*self.bilstm_layers, batch_size, self.hidden_size))
        self.h_n = zeros.to(device)
        self.c_n = zeros.to(device)
        return self.h_n, self.c_n
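Note that `self.attention` is never assigned as an attribute, so both `if self.attention:` checks resolve to the bound `attention` method, which is always truthy; the attention branch therefore always runs. A hedged driver sketch with illustrative sizes (the embedding matrix, prior, and import path are assumptions, not part of this hunk):

```python
# Hypothetical driver; shapes and values are illustrative only.
import torch

emb = torch.randn(10000, 300)               # pretrained embeddings, vocab_size x 300
device = torch.device("cpu")
model = Network(embeddings=emb, hidden_size=128, prior=0.5, device=device)

tokens = torch.randint(0, 10000, (4, 20))   # [batch, seq_len] word indices
labels = torch.randint(0, 2, (4, 20))       # per-token labels, 0 = keep, 1 = delete
loss, preds, atts = model(tokens, labels)   # preds: [batch, seq_len]
```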
@@ -0,0 +1,183 @@
import torch
from torch import optim
import tqdm

from const import Phase
from batch import create_dataset
from models import Baseline
from sklearn.metrics import classification_report


def run(dataset_train,
        dataset_dev,
        dataset_test,
        model_type,
        word_embed_size,
        hidden_size,
        batch_size,
        device,
        n_epochs):

    if model_type == 'base':
        model = Baseline(vocab=dataset_train.vocab,
                         word_embed_size=word_embed_size,
                         hidden_size=hidden_size,
                         device=device,
                         inference=False)
    else:
        raise NotImplementedError
    model = model.to(device)

    optim_params = model.parameters()
    optimizer = optim.Adam(optim_params, lr=10**-3)

    print('start training')
    for epoch in range(n_epochs):
        train_loss, tokens, preds, golds = train(dataset_train,
                                                 model,
                                                 optimizer,
                                                 batch_size,
                                                 epoch,
                                                 Phase.TRAIN,
                                                 device)

        dev_loss, tokens, preds, golds = train(dataset_dev,
                                               model,
                                               optimizer,
                                               batch_size,
                                               epoch,
                                               Phase.DEV,
                                               device)
        logger = '\t'.join(['epoch {}'.format(epoch+1),
                            'TRAIN Loss: {:.9f}'.format(train_loss),
                            'DEV Loss: {:.9f}'.format(dev_loss)])
        # print('\r'+logger, end='')
        print(logger)
    test_loss, tokens, preds, golds = train(dataset_test,
                                            model,
                                            optimizer,
                                            batch_size,
                                            epoch,
                                            Phase.TEST,
                                            device)
    print('====', 'TEST', '=====')
    print_scores(preds, golds)
    output_results(tokens, preds, golds)


def train(dataset,
          model,
          optimizer,
          batch_size,
          n_epoch,
          phase,
          device):

    total_loss = 0.0
    tokens = []
    preds = []
    labels = []
    if phase == Phase.TRAIN:
        model.train()
    else:
        model.eval()

    for batch in tqdm.tqdm(dataset.batch_iter):
        token = getattr(batch, 'token')
        label = getattr(batch, 'label')
        raw_sentences = dataset.get_raw_sentence(token.data.detach().cpu().numpy())

        loss, pred = \
            model(token, raw_sentences, label, phase)

        if phase == Phase.TRAIN:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=5)
            optimizer.step()

        # remove PAD from input sentences/labels and results
        mask = (token != dataset.pad_index)
        length_tensor = mask.sum(1)
        length_tensor = length_tensor.data.detach().cpu().numpy()

        for index, n_tokens_in_the_sentence in enumerate(length_tensor):
            if n_tokens_in_the_sentence > 0:
                tokens.append(raw_sentences[index][:n_tokens_in_the_sentence])
                _label = label[index][:n_tokens_in_the_sentence]
                _pred = pred[index][:n_tokens_in_the_sentence]
                _label = _label.data.detach().cpu().numpy()
                _pred = _pred.data.detach().cpu().numpy()
                labels.append(_label)
                preds.append(_pred)

        total_loss += torch.mean(loss).item()

    return total_loss, tokens, preds, labels


def read_two_cols_data(fname, max_len=None):
    data = {}
    tokens = []
    labels = []
    token = []
    label = []
    with open(fname, mode='r') as f:
        for line in f:
            line = line.strip().lower().split()
            if line:
                try:
                    _token, _label = line
                except ValueError:
                    raise
                token.append(_token)
                if _label == '0' or _label == '1':
                    label.append(int(_label))
                else:
                    if _label == 'del':
                        label.append(1)
                    else:
                        label.append(0)
            else:
                if max_len is None or len(token) <= max_len:
                    tokens.append(token)
                    labels.append(label)
                token = []
                label = []

    data['tokens'] = tokens
    data['labels'] = labels
    return data


def load(train_path, dev_path, test_path, batch_size, max_len, device):
    train = read_two_cols_data(train_path, max_len)
    dev = read_two_cols_data(dev_path)
    test = read_two_cols_data(test_path)
    data = {Phase.TRAIN: train, Phase.DEV: dev, Phase.TEST: test}
    return create_dataset(data, batch_size=batch_size, device=device)


def print_scores(preds, golds):
    _preds = [label for sublist in preds for label in sublist]
    _golds = [label for sublist in golds for label in sublist]
    target_names = ['not_del', 'del']
    print(classification_report(_golds, _preds, target_names=target_names, digits=5))


def output_results(tokens, preds, golds, path='./result/sentcomp'):
    with open(path+'.original.txt', mode='w') as w, \
            open(path+'.gold.txt', mode='w') as w_gold, \
            open(path+'.pred.txt', mode='w') as w_pred:

        for _tokens, _golds, _preds in zip(tokens, golds, preds):
            for token, gold, pred in zip(_tokens, _golds, _preds):
                w.write(token + ' ')
                if gold == 0:
                    w_gold.write(token + ' ')
                # 0 -> keep, 1 -> delete
                if pred == 0:
                    w_pred.write(token + ' ')
            w.write('\n')
            w_gold.write('\n')
            w_pred.write('\n')
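`read_two_cols_data` reads whitespace-separated `token label` pairs, one per line, with a blank line between sentences; labels may be `0`/`1` or strings, where `del` maps to 1 (delete) and anything else to 0 (keep). A hedged sketch of the wiring (file names, sizes, and the device id are placeholders):

```python
# Hypothetical wiring; paths, sizes and device id are placeholders.
#
# train.txt (blank line between sentences):
#   overall 0
#   the     del
#   plan    0
dataset_train, dataset_dev, dataset_test = load(
    'train.txt', 'dev.txt', 'test.txt',
    batch_size=32, max_len=70, device=-1,   # -1 selects the CPU in older torchtext
)
# run(dataset_train, dataset_dev, dataset_test, model_type='base', ...) then trains,
# evaluates on the test split, and writes result/sentcomp.{original,gold,pred}.txt
```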
joint_sentence_compression_model/libs/utils.py (new file, 218 lines)
@@ -0,0 +1,218 @@
import json
import logging
import math
import os
import random
import re
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn

import config


plt.switch_backend("agg")


def load_glove(vocabulary):
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except (FileNotFoundError, json.JSONDecodeError):
        logger.info("cache doesn't exist")
        cache = {}
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            try:
                cache[word] = data[word]
            except KeyError:
                cache[word] = [0] * 300
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings


def tokenize(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.split(" ")
    return s


def indices_from_sentence(word2index, sentence, unknown_threshold):
    if unknown_threshold:
        return [
            word2index.get(
                word if random.random() > unknown_threshold else config.UNK,
                word2index[config.UNK],
            )
            for word in sentence
        ]
    else:
        return [
            word2index.get(word, word2index[config.UNK]) for word in sentence
        ]


def tensor_from_sentence(word2index, sentence, unknown_threshold):
    indices = indices_from_sentence(word2index, sentence, unknown_threshold)
    return torch.tensor(indices, dtype=torch.long, device=config.DEV)


def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    tensors = [
        tensor_from_sentence(word2index, pair[0], unknown_threshold),
        tensor_from_sentence(word2index, pair[1], unknown_threshold),
    ]
    if shuffle:
        random.shuffle(tensors)
    return tensors


def bleu(reference, hypothesis, n=4):
    if n < 1:
        return 0
    weights = [1/n]*n
    return sentence_bleu([reference], hypothesis, weights)


def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    if shuffle:
        pairs = pairs.copy()
        random.shuffle(pairs)
    for pair in pairs:
        tensor1, tensor2 = tensors_from_pair(word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold)
        yield (tensor1,), (tensor2,)


def sent_iter(sents, word2index, batch_size, unknown_threshold=0.00):
    for i in range(len(sents)//batch_size+1):
        raw_sents = [x[0] for x in sents[i*batch_size:i*batch_size+batch_size]]
        _sents = [tensor_from_sentence(word2index, sent, unknown_threshold) for sent, target in sents[i*batch_size:i*batch_size+batch_size]]
        _targets = [torch.tensor(target, dtype=torch.long).to(config.DEV) for sent, target in sents[i*batch_size:i*batch_size+batch_size]]
        if raw_sents and _sents and _targets:
            yield (raw_sents, _sents, _targets)


def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    for i in range(len(pairs) // batch_size):
        batch = pairs[i * batch_size : (i + 1) * batch_size]
        if len(batch) != batch_size:
            continue
        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        yield tensors1, tensors2


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)


def save_model(model, word2index, path):
    if not path.endswith(".tar"):
        path += ".tar"
    torch.save(
        {"weights": model.state_dict(), "word2index": word2index},
        path,
    )


def load_model(path):
    checkpoint = torch.load(path)
    return checkpoint["weights"], checkpoint["word2index"]


def extend_vocabulary(word2index, langs):
    for lang in langs:
        for word in lang.word2index:
            if word not in word2index:
                word2index[word] = len(word2index)
    return word2index
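`load_glove` returns a `[len(vocabulary), 300]` float tensor and caches lookups in `glove.cache`, falling back to a local `glove.840B.300d.txt` on a miss, so it plugs directly into the pretrained-embedding path of the models above. A hedged sketch (import paths assumed):

```python
# Hypothetical glue between corpora.Lang and utils.load_glove; import paths assumed.
from libs import corpora, utils

data, lang = corpora.load_google("train")
vocabulary = [lang.index2word[i] for i in range(lang.n_words)]
embeddings = utils.load_glove(vocabulary)   # [n_words, 300], cached in glove.cache
# e.g. pass `embeddings` to the embedding_type="glove" Network above
```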