Add NLP task models

This commit is contained in:
Ekta Sood 2020-12-08 21:10:52 +01:00
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions

View file

View file

@ -0,0 +1,416 @@
import logging
import config
def tokenize(sent):
    """Split a sentence into tokens on single space characters.

    The separator is passed explicitly, so consecutive spaces produce
    empty-string tokens and an empty input produces ``[""]``.
    """
    separator = " "
    tokens = sent.split(separator)
    return tokens
class Lang:
    """Represents the vocabulary.

    Keeps a word -> index map, the inverse index -> word map and per-word
    occurrence counts. The five special tokens from ``config`` (PAD, UNK,
    NOFIX, SOS, EOS) are pre-registered at indices 0-4 and start without a
    count entry.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
            config.NOFIX: 2,
            config.SOS: 3,
            config.EOS: 4,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
            2: config.NOFIX,
            3: config.SOS,
            4: config.EOS,
        }
        # Next free index; the five special tokens occupy 0-4.
        self.n_words = 5

    def add_sentence(self, sentence):
        """Register every token of an already-tokenized sentence."""
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        """Register one occurrence of ``word``, assigning an index if new."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        # Special tokens are in ``word2index`` but have no count entry yet, so
        # a plain ``+= 1`` would raise KeyError when the corpus contains one of
        # them literally; ``get`` covers both new and known words.
        self.word2count[word] = self.word2count.get(word, 0) + 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs (this one first, so its words get the
        # lower indices).
        for word in self.word2count:
            new_lang.add_word(word)
        for word in other.word2count:
            new_lang.add_word(word)

        # Fix the counts on the new one: add_word only counted each word once
        # per source, the real count is the sum of both sources.
        for word in new_lang.word2count:
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)
        return new_lang
def load_wiki(split):
    """Load the Wiki from PAWs.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every row whose rating field is not ``"0"`` and ``lang`` is the
        ``Lang`` vocabulary built from those sentences.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_wiki")
    lang = Lang("wiki")

    if split == "train":
        path = config.wiki_train_path
    elif split == "val":
        path = config.wiki_dev_path
    elif split == "test":
        path = config.wiki_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            # Tab-separated row: <ignored>, sentence 1, sentence 2, rating.
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    # Both sides of every pair share one vocabulary.
    return pairs, lang
def load_qqp_paws(split):
    """Load the QQP from PAWs.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every row whose rating field is not ``"0"`` and ``lang`` is the
        ``Lang`` vocabulary built from those sentences.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp_paws")
    lang = Lang("qqp_paws")

    if split == "train":
        path = config.qqp_paws_train_path
    elif split == "val":
        path = config.qqp_paws_dev_path
    elif split == "test":
        path = config.qqp_paws_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            # Tab-separated row: <ignored>, sentence 1, sentence 2, rating.
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    # Both sides of every pair share one vocabulary.
    return pairs, lang
def load_qqp(split):
    """Load the QQP from Original.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every row whose rating field is not ``"0"`` and ``lang`` is the
        ``Lang`` vocabulary built from those sentences.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp")
    lang = Lang("qqp")

    if split == "train":
        path = config.qqp_train_path
    elif split == "val":
        path = config.qqp_dev_path
    elif split == "test":
        path = config.qqp_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            # Tab-separated row: rating, sentence 1, sentence 2, <ignored>.
            rating, sent1, sent2, _ = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    # Both sides of every pair share one vocabulary.
    return pairs, lang
def load_qqp_kag(split):
    """Load the QQP from Kaggle.

    Not the original QQP right now: experimenting with the Kaggle
    100K / 3K / 30K split.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every row whose rating field is not ``"0"`` and ``lang`` is the
        ``Lang`` vocabulary built from those sentences.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp_kag")
    lang = Lang("qqp_kag")

    if split == "train":
        path = config.qqp_kag_train_path
    elif split == "val":
        path = config.qqp_kag_dev_path
    elif split == "test":
        path = config.qqp_kag_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            # The Kaggle version has 3 tab-separated fields rather than 4:
            # rating, sentence 1, sentence 2.
            rating, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    # Both sides of every pair share one vocabulary.
    return pairs, lang
def load_msrpc(split):
    """Load the Microsoft Research Paraphrase Corpus (MSRPC).

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every row whose rating field is not ``"0"`` and ``lang`` is the
        ``Lang`` vocabulary built from those sentences.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_msrpc")
    lang = Lang("msrpc")

    if split == "train":
        path = config.msrpc_train_path
    elif split == "val":
        path = config.msrpc_dev_path
    elif split == "test":
        path = config.msrpc_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            # Tab-separated row: rating, two ignored fields, sentence 1,
            # sentence 2.
            rating, _, _, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    # Both sides of every pair share one vocabulary.
    return pairs, lang
def load_sentiment(split):
    """Load the Sentiment Kaggle Comp Dataset.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        built from the third and fourth field of every data row and ``lang``
        is the ``Lang`` vocabulary built from those token lists.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_sentiment")
    lang = Lang("sentiment")

    if split == "train":
        path = config.sentiment_train_path
    elif split == "val":
        path = config.sentiment_dev_path
    elif split == "test":
        path = config.sentiment_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # skip header
        handle.readline()

        for line in handle:
            # Four tab-separated fields; only the last two are used.
            _, _, sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    return pairs, lang
def load_tamil(split):
    """Load the En to Tamil dataset, current SOTA ~13 bleu.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every data row and ``lang`` is the single ``Lang`` vocabulary
        built from both columns.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_tamil")
    lang = Lang("tamil")

    if split == "train":
        path = config.tamil_train_path
    elif split == "val":
        path = config.tamil_dev_path
    elif split == "test":
        path = config.tamil_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # The first line is read and discarded.
        handle.readline()

        for line in handle:
            # Two tab-separated fields per row; there is no rating column.
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            # NOTE: the second column goes through the same space-splitting
            # tokenizer; no Tamil-specific tokenization is applied.
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    return pairs, lang
def load_compression(split):
    """Load the Google Sentence Compression Dataset.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        for every data row and ``lang`` is the ``Lang`` vocabulary built from
        both columns.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.compression_train_path
    elif split == "val":
        path = config.compression_dev_path
    elif split == "test":
        path = config.compression_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # The first line is read and discarded.
        handle.readline()

        for line in handle:
            # Two tab-separated fields per row.
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    return pairs, lang
def load_stanford(split):
    """Load the Stanford Sentiment Dataset phrases.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        built from the third and fourth field of every line and ``lang`` is
        the ``Lang`` vocabulary built from those token lists.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_stanford")
    lang = Lang("stanford")

    if split == "train":
        path = config.stanford_train_path
    elif split == "val":
        path = config.stanford_dev_path
    elif split == "test":
        path = config.stanford_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # Unlike the other loaders, no header line is skipped here: every
        # line of the file is treated as data.
        for line in handle:
            # Four tab-separated fields; only the last two are used.
            _, _, sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    return pairs, lang
def load_stanford_sent(split):
    """Load the Stanford Sentiment Dataset sentences.

    Args:
        split: ``"train"``, ``"val"`` or ``"test"``; selects which path from
            ``config`` is read.

    Returns:
        ``(pairs, lang)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        built from the third and fourth field of every line and ``lang`` is
        the ``Lang`` vocabulary built from those token lists.

    Raises:
        ValueError: If ``split`` is not one of the known split names.
    """
    logger = logging.getLogger(f"{__name__}.load_stanford_sent")
    lang = Lang("stanford_sent")

    if split == "train":
        path = config.stanford_sent_train_path
    elif split == "val":
        path = config.stanford_sent_dev_path
    elif split == "test":
        path = config.stanford_sent_test_path
    else:
        # Without this branch ``path`` stayed unbound and the function died
        # later with an UnboundLocalError.
        raise ValueError(f"unknown split: {split!r}")

    logger.info("loading %s from %s", split, path)

    pairs = []
    with open(path) as handle:
        # Unlike the other loaders, no header line is skipped here: every
        # line of the file is treated as data.
        for line in handle:
            # Four tab-separated fields; only the last two are used.
            _, _, sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)
            pairs.append([sent1, sent2])

    return pairs, lang

View file

@ -0,0 +1 @@
from .main import *

View file

@ -0,0 +1,131 @@
from collections import OrderedDict
import logging
import sys
from .self_attention import Transformer
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence
def random_embedding(vocab_size, embedding_dim):
    """Return a randomly initialised embedding matrix.

    Args:
        vocab_size: Number of rows (one per vocabulary entry).
        embedding_dim: Number of columns (embedding width).

    Returns:
        A float64 ``numpy`` array of shape ``(vocab_size, embedding_dim)``
        whose entries are drawn uniformly from ``[-scale, scale)`` with
        ``scale = sqrt(3 / embedding_dim)``.
    """
    scale = np.sqrt(3.0 / embedding_dim)
    # One vectorized draw replaces the former Python loop over every row.
    return np.random.uniform(-scale, scale, [vocab_size, embedding_dim])
def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    """Compute the summed NLL loss per batch item and the arg-max predictions.

    Args:
        outputs: Unnormalised scores; reshaped here to
            ``(batch_size * seq_len, n_classes)``.
        batch_label: Gold class indices; reshaped here to
            ``(batch_size * seq_len,)``. Positions labelled ``0`` are ignored
            by the loss.
        batch_size: Number of sequences in the batch.
        seq_len: Number of positions per sequence.

    Returns:
        ``(loss, tag_seq)`` where ``loss`` is the summed negative
        log-likelihood divided by ``batch_size`` and ``tag_seq`` holds the
        arg-max class per position with shape ``(batch_size, seq_len)``.
    """
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)
    # ``reduction="sum"`` is the supported spelling of the deprecated
    # ``size_average=False``.
    criterion = nn.NLLLoss(ignore_index=0, reduction="sum")
    loss = criterion(score, batch_label.view(batch_size * seq_len))
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)
    return loss, tag_seq
def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    """Masked, summed MSE between sigmoid scores and the labels.

    The scores of row ``i`` beyond ``word_seq_length[i]`` are zeroed before
    the loss is computed.

    Returns:
        ``(loss, score)`` where ``loss`` is the summed squared error divided
        by ``batch_size`` and ``score`` is the masked sigmoid output viewed as
        ``(batch_size, seq_len)``.
    """
    probabilities = torch.sigmoid(outputs)

    # 1 for real positions, 0 for padding.
    valid = torch.zeros_like(probabilities)
    for row, length in enumerate(word_seq_length):
        valid[row, :length] = 1
    masked = probabilities * valid

    criterion = nn.MSELoss(reduction="sum")
    total = criterion(
        masked.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )
    return total / batch_size, masked.view(batch_size, seq_len)
class Network(nn.Module):
    """Embedding (optional) -> BiLSTM -> optional Transformer -> scalar head.

    Args:
        embedding_type: ``"w2v"`` / ``"glove"`` add an embedding layer plus
            dropout in front of the LSTM; ``"bert"`` adds no embedding layer
            and fixes the LSTM input width to 768. Any other value adds no
            embedding layer and uses ``embedding_dim`` as given.
        vocab_size: Rows of the embedding table when no pretrained
            ``embeddings`` are supplied.
        embedding_dim: Embedding width when no pretrained ``embeddings`` are
            supplied.
        dropout: Dropout probability used after the embedding and the LSTM.
        hidden_dim: Width of the BiLSTM output (``hidden_dim // 2`` per
            direction) and of the attention layer.
        embeddings: Optional pretrained 2-D embedding tensor.
        attention: Whether to insert the ``Transformer`` layer after the LSTM.
    """

    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
                # The LSTM input width must follow the pretrained table rather
                # than a hard-coded 300.
                embedding_dim = embeddings.shape[1]
            else:
                # Use the caller's embedding_dim for both the table and the
                # LSTM; previously the LSTM was always built for 300 inputs.
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if attention:
            # Experiment history: 1024-D with 16 heads / 16 layers for the
            # att vs. no-att comparison; 128-D with 4 heads / 4 layers for the
            # initial attention and pretraining runs; 128-D with 4 heads and
            # 1 layer for the reported results.
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info("prelayers: %s", prelayers.keys())
        self.logger.info("postlayers: %s", postlayers.keys())

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        """Run the network; ``word_seq_length`` holds the per-item lengths."""
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)
        # The LSTM wrapper returns its output with the first two axes swapped
        # relative to its batch-first input, so swap them back before the
        # post layers.
        return self.post(x.transpose(1, 0))
class BiLSTM(nn.Module):
    """Bidirectional, batch-first LSTM that handles padded batches."""

    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        """Encode ``x`` (batch-first) given the true length of every item.

        The result comes back from ``pad_packed_sequence`` with its default
        layout, i.e. time-major rather than batch-first.
        """
        packed = pack_padded_sequence(
            x, word_seq_length, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.net(packed)
        padded_out, _ = pad_packed_sequence(packed_out)
        return padded_out

View file

@ -0,0 +1,131 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to its input."""

    def __init__(self, d_hid, n_position=200):
        super().__init__()
        # Registered as a buffer: it is part of the module but not a
        # trainable parameter.
        table = self._get_sinusoid_encoding_table(n_position, d_hid)
        self.register_buffer('pos_table', table)

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy
        # Each pair of columns (2i, 2i+1) shares one divisor.
        divisors = [
            np.power(10000, 2 * (column // 2) / d_hid) for column in range(d_hid)
        ]
        angles = np.array(
            [[position / divisor for divisor in divisors]
             for position in range(n_position)]
        )
        angles[:, 0::2] = np.sin(angles[:, 0::2])  # dim 2i
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # dim 2i+1
        return torch.FloatTensor(angles).unsqueeze(0)

    def forward(self, x):
        """Return ``x`` plus the first ``x.size(1)`` rows of the table."""
        positions = self.pos_table[:, :x.size(1)].clone().detach()
        return x + positions
class AttentionLayer(nn.Module):
    """Scaled dot-product attention without any learned parameters."""

    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V):
        """Attend with queries ``Q`` over keys ``K`` and values ``V``.

        Shapes (as documented by the original author):
            Q: float32[batch_size, n_queries, d_k]
            K: float32[batch_size, n_keys, d_k]
            V: float32[batch_size, n_keys, d_v]

        Returns:
            float32[batch_size, n_queries, d_v]
        """
        key_dim = K.shape[-1]
        # logits: [batch_size, n_queries, n_keys], scaled by sqrt(d_k)
        logits = torch.bmm(Q, K.transpose(-1, -2)) / math.sqrt(key_dim)
        # Normalise over the keys.
        attention = F.softmax(logits, dim=-1)
        return torch.bmm(attention, V)
class MultiHeadedSelfAttentionLayer(nn.Module):
    """Multi-head self-attention with one bias-free projection per head.

    Args:
        d_model: Width of the input and output features.
        n_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        # The stray debug print of (d_model, n_heads) was removed: constructing
        # a layer should not write to stdout.
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        # Query, key and value projections, one of each per head.
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        # Output projection applied to the concatenated heads.
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: float32[batch_size, sequence_length, d_model]

        Returns:
            float32[batch_size, sequence_length, d_model]
        """
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q float32:[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K float32:[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V float32:[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # float32:[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated float32:[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out float32:[batch_size, sequence_length, self.d_model]
        return out
class Feedforward(nn.Module):
    """Position-wise two-layer MLP that keeps the feature width ``d_model``."""

    def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Map ``x`` (float32[batch_size, sequence_length, d_model]) through
        linear -> ReLU -> linear."""
        hidden = self.W1(x)
        activated = torch.relu(hidden)
        return self.W2(activated)
class Transformer(nn.Module):
    """Stack of self-attention + feed-forward layers with residual
    connections and parameter-free layer normalisation."""

    def __init__(self, d_model, n_heads, n_layers):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList(
            [MultiHeadedSelfAttentionLayer(d_model, n_heads) for _ in range(n_layers)]
        )
        self.ffs = nn.ModuleList([Feedforward(d_model) for _ in range(n_layers)])

    def forward(self, x):
        """Transform ``x`` (float32[batch_size, sequence_length, d_model])
        and return a tensor of the same shape."""
        for self_attention, feedforward in zip(self.attention_layers, self.ffs):
            # Residual connection around the attention sub-layer, normalised
            # over every axis from index 2 onwards.
            attended = self_attention(x)
            x = F.layer_norm(x + attended, x.shape[2:])
            # Residual connection around the feed-forward sub-layer.
            projected = feedforward(x)
            x = F.layer_norm(x + projected, x.shape[2:])
        return x

View file

@ -0,0 +1 @@
from .main import *

View file

@ -0,0 +1,86 @@
import json
import math
import os
import random
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class EncoderRNN(nn.Module):
    """Single-step GRU encoder on top of a pretrained embedding table."""

    def __init__(self, input_size, hidden_size, embeddings):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token, run one GRU step, return ``(output, hidden)``."""
        # Reshape the embedding to (1, 1, features) for the GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(step_input, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden)."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape)
class AttnDecoderRNN(nn.Module):
    """Single-step GRU decoder whose attention weights are rescaled by
    externally supplied fixation values."""

    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        embeddings,
        dropout_p,
        max_length,
    ):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Pretrained (frozen) embeddings; used for paraphrase generation.
        self.embedding = nn.Embedding.from_pretrained(embeddings)

        combined_width = self.input_size + self.hidden_size
        self.attn = nn.Linear(combined_width, self.max_length)
        self.attn_combine = nn.Linear(combined_width, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, fixations):
        """Decode one step.

        Returns:
            ``(output, hidden, attn_weights)`` where ``output`` holds the raw
            (un-normalised) scores from the output layer.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention over max_length positions from the embedded token and the
        # current hidden state.
        attn_input = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input), dim=1)

        # Right-pad the fixation values with zeros up to the attention width
        # and use them to rescale the weights (no renormalisation afterwards).
        pad_right = attn_weights.shape[-1] - fixations.shape[-2]
        padder = torch.nn.ConstantPad1d((0, pad_right), 0)
        fixation_row = padder(fixations.squeeze().unsqueeze(0))
        attn_weights = attn_weights * fixation_row

        context = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )

        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        output = self.out(output[0])
        return output, hidden, attn_weights

View file

@ -0,0 +1,225 @@
import json
import logging
import math
import os
import random
import re
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn
import config
plt.switch_backend("agg")
def load_glove(vocabulary):
    """Return a 300-d GloVe embedding matrix for ``vocabulary``.

    Embeddings are served from the JSON file ``glove.cache`` in the working
    directory. When the cache is missing or lacks any vocabulary word, the
    full ``glove.840B.300d.txt`` file is parsed, the cache is extended (words
    absent from GloVe get an all-zero vector) and written back.

    Args:
        vocabulary: Iterable of words; its iteration order defines the row
            order of the result.

    Returns:
        A float32 tensor with one 300-d row per vocabulary word.
    """
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")

    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except (OSError, ValueError):
        # OSError: cache file missing or unreadable; ValueError: invalid JSON
        # (JSONDecodeError is a ValueError). Anything else is a real bug and
        # is no longer swallowed by a bare ``except``.
        logger.info("cache doesn't exist")
        cache = {}
        # Special tokens get zero vectors.
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    # Lines whose remaining fields are not all numeric are
                    # skipped.
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            # Words without a GloVe vector fall back to zeros.
            cache[word] = data.get(word, [0] * 300)
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings
def tokenize(s):
    """Lower-case ``s``, isolate ``.``, ``!`` and ``?`` as separate tokens,
    replace every other non-letter run with one space and split on spaces."""
    text = s.lower().strip()
    substitutions = (
        (r"([.!?])", r" \1"),        # put a space in front of . ! ?
        (r"[^a-zA-Z.!?]+", r" "),    # collapse everything else to a space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.split(" ")
def indices_from_sentence(word2index, sentence, unknown_threshold):
    """Map every word of ``sentence`` to its index.

    Words missing from ``word2index`` map to the UNK index. When
    ``unknown_threshold`` is truthy, each word is additionally replaced by
    UNK whenever a fresh ``random.random()`` draw is not greater than the
    threshold.
    """
    indices = []
    for word in sentence:
        # The random draw only happens when a threshold is set.
        if unknown_threshold and not random.random() > unknown_threshold:
            word = config.UNK
        indices.append(word2index.get(word, word2index[config.UNK]))
    return indices
def tensor_from_sentence(word2index, sentence, unknown_threshold):
    """Return ``sentence`` as a long tensor of indices on ``config.DEV``,
    terminated by the EOS index (no SOS index is prepended)."""
    token_ids = indices_from_sentence(word2index, sentence, unknown_threshold)
    eos_id = word2index[config.EOS]
    token_ids.append(eos_id)
    return torch.tensor(token_ids, dtype=torch.long, device=config.DEV)
def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    """Convert both sentences of ``pair`` to index tensors.

    Returns a two-element list; when ``shuffle`` is truthy the order of the
    two tensors is randomised in place.
    """
    first = tensor_from_sentence(word2index, pair[0], unknown_threshold)
    second = tensor_from_sentence(word2index, pair[1], unknown_threshold)
    result = [first, second]
    if shuffle:
        random.shuffle(result)
    return result
def bleu(reference, hypothesis, n=4):
    """Sentence-level BLEU of ``hypothesis`` against a single ``reference``.

    ``n`` uniform weights of ``1 / n`` are handed to ``sentence_bleu``;
    ``n < 1`` short-circuits to ``0``.
    """
    if n < 1:
        return 0
    uniform_weights = [1 / n] * n
    references = [reference]
    return sentence_bleu(references, hypothesis, uniform_weights)
def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    """Yield every pair as two one-element tuples of index tensors.

    Args:
        shuffle: Iterate over a shuffled copy of ``pairs``; the caller's list
            is left untouched.
        shuffle_pairs: Randomise the order of the two sentences inside each
            pair.
        unknown_threshold: Forwarded to the sentence conversion.
    """
    ordered = pairs
    if shuffle:
        ordered = pairs.copy()
        random.shuffle(ordered)

    for entry in ordered:
        first, second = tensors_from_pair(
            word2index, (entry[0], entry[1]), shuffle_pairs, unknown_threshold
        )
        yield (first,), (second,)
def sent_iter(sents, word2index, unknown_threshold=0.00):
    """Yield each sentence of ``sents`` as a one-element tuple holding its
    index tensor."""
    for sentence in sents:
        yield (tensor_from_sentence(word2index, sentence, unknown_threshold),)
def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    """Yield consecutive, non-overlapping batches of index tensors.

    Args:
        pairs: Sequence of ``(sentence1, sentence2, ...)`` entries.
        word2index: Vocabulary mapping used for the conversion.
        batch_size: Number of pairs per batch; a trailing incomplete batch is
            dropped.
        shuffle: Forwarded to ``tensors_from_pair``: randomises the order of
            the two sentences inside each pair (not the order of the pairs).
        unknown_threshold: Forwarded to the sentence conversion.

    Yields:
        ``(tensors1, tensors2)``, two tuples of ``batch_size`` tensors each.
    """
    for i in range(len(pairs) // batch_size):
        # Step by a whole batch. The former ``pairs[i : i + batch_size]``
        # advanced by a single item, so batches overlapped and most of the
        # data was never visited.
        start = i * batch_size
        batch = pairs[start : start + batch_size]

        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        yield tensors1, tensors2
def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)
def timeSince(since, percent):
    """Return ``"<elapsed> (- <remaining>)"`` for a task started at ``since``.

    ``percent`` is the completed fraction of the task; the remaining time is
    extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (- %s)" % (asMinutes(elapsed), asMinutes(remaining))
def showPlot(points):
    """Plot ``points`` as a line on a new figure with y ticks every 0.2."""
    # ``plt.subplots`` already creates a figure; the extra ``plt.figure()``
    # that used to precede it only produced an additional, unused figure on
    # every call.
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix as a heat map with word labels.

    Input words (plus an EOS marker) label the x axis, ``output_words`` label
    the y axis.
    """
    # Heat map with colour bar.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap="bone")
    figure.colorbar(heatmap)

    # Axis labels; the leading empty string is kept from the original label
    # lists.
    axes.set_xticklabels(
        [""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90
    )
    axes.set_yticklabels([""] + output_words)

    # One tick (and therefore one label) per row / column.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
def evaluateAndShowAttention(input_sentence):
    """Evaluate ``input_sentence``, print input and output and plot the
    attention weights."""
    # NOTE(review): ``evaluate``, ``encoder1`` and ``attn_decoder1`` are not
    # defined or imported in the visible part of this module — confirm they
    # exist before relying on this function.
    result = evaluate(encoder1, attn_decoder1, input_sentence)
    output_words, attentions = result
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)
def save_model(model, word2index, path):
    """Save the model weights together with the vocabulary mapping.

    ``.tar`` is appended to ``path`` unless it already ends with it.
    """
    target = path if path.endswith(".tar") else path + ".tar"
    checkpoint = {"weights": model.state_dict(), "word2index": word2index}
    torch.save(checkpoint, target)
def load_model(path):
    """Load a checkpoint and return ``(weights, word2index)`` read from its
    ``"weights"`` and ``"word2index"`` entries."""
    checkpoint = torch.load(path)
    weights = checkpoint["weights"]
    vocabulary = checkpoint["word2index"]
    return weights, vocabulary
def extend_vocabulary(word2index, langs):
    """Add every word known to ``langs`` to ``word2index`` in place.

    New words receive the current size of ``word2index`` as their index.
    The same (mutated) mapping is returned.
    """
    for lang in langs:
        for word in lang.word2index:
            # setdefault leaves existing words alone; the default is computed
            # before insertion, so it equals the size prior to adding.
            word2index.setdefault(word, len(word2index))
    return word2index