Add NLP task models

Ekta Sood 2020-12-08 21:10:52 +01:00
parent d8beb17dfb
commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions

joint_paraphrase_model/.gitignore vendored Normal file
View file

@@ -0,0 +1,428 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,latex
# Edit at https://www.toptal.com/developers/gitignore?templates=python,latex
### LaTeX ###
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb
## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf
## Generated if empty string is given at "Please type another file name for output:"
.pdf
## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml
## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync
## Build tool directories for auxiliary files
# latexrun
latex.out/
## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa
# achemso
acs-*.bib
# amsthm
*.thm
# beamer
*.nav
*.pre
*.snm
*.vrb
# changes
*.soc
# comment
*.cut
# cprotect
*.cpt
# elsarticle (documentclass of Elsevier journals)
*.spl
# endnotes
*.ent
# fixme
*.lox
# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm
#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R
# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs
# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist
# gnuplottex
*-gnuplottex-*
# gregoriotex
*.gaux
*.gtex
# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref
# hyperref
*.brf
# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary
# listings
*.lol
# luatexja-ruby
*.ltjruby
# makeidx
*.idx
*.ilg
*.ind
# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*
# minted
_minted*
*.pyg
# morewrites
*.mw
# nomencl
*.nlg
*.nlo
*.nls
# pax
*.pax
# pdfpcnotes
*.pdfpc
# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd
# scrwfile
*.wrt
# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/
# pdfcomment
*.upa
*.upb
# pythontex
*.pytxcode
pythontex-files-*/
# tcolorbox
*.listing
# thmtools
*.loe
# TikZ & PGF
*.dpth
*.md5
*.auxlock
# todonotes
*.tdo
# vhistory
*.hst
*.ver
# easy-todo
*.lod
# xcolor
*.xcp
# xmpincl
*.xmpi
# xindy
*.xdy
# xypic precompiled matrices and outlines
*.xyc
*.xyd
# endfloat
*.ttt
*.fff
# Latexian
TSWLatexianTemp*
## Editors:
# WinEdt
*.bak
*.sav
# Texpad
.texpadtmp
# LyX
*.lyx~
# Kile
*.backup
# gummi
.*.swp
# KBibTeX
*~[0-9]*
# TeXnicCenter
*.tps
# auto folder when using emacs and auctex
./auto/*
*.el
# expex forward references with \gathertags
*-tags.tex
# standalone packages
*.sta
# Makeindex log files
*.lpz
# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib
### LaTeX Patch ###
# LIPIcs / OASIcs
*.vtc
# glossaries
*.glstex
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other info into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, for collaborative projects with platform-specific dependencies, or
# dependencies lacking cross-platform support, pipenv may install dependencies
# that don't work, or fail to install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# End of https://www.toptal.com/developers/gitignore/api/python,latex

View file

@@ -0,0 +1,3 @@
# joint_paraphrase_model
joint training paraphrase model --- NeurIPS

View file

@@ -0,0 +1,112 @@
import os
import torch
# general
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PAD = "<__PAD__>"
UNK = "<__UNK__>"
NOFIX = "<__NOFIX__>"
SOS = "<__SOS__>"
EOS = "<__EOS__>"
batch_size = 1
teacher_forcing_ratio = 0.5
embedding_dim = 300
fix_hidden_dim = 128
par_hidden_dim = 1024
fix_dropout = 0.5
par_dropout = 0.2
_fix_learning_rate = 0.00001
_par_learning_rate = 0.0001
learning_rate = _par_learning_rate
fix_momentum = 0.9
par_momentum = 0.0
max_length = 121
epochs = 5
# paths
data_path = "./data"
provo_predictability_path = os.path.join(
data_path, "datasets/provo/Provo_Corpus-Predictability_Norms.csv"
)
provo_eyetracking_path = os.path.join(
data_path, "datasets/provo/Provo_Corpus-Eyetracking_Data.csv"
)
geco_en_path = os.path.join(data_path, "datasets/geco/EnglishMaterial.csv")
geco_mono_path = os.path.join(data_path, "datasets/geco/MonolingualReadingData.csv")
movieqa_human_path = os.path.join(data_path, "datasets/all_word_scores_fixations")
movieqa_human_path_2 = os.path.join(
data_path, "datasets/all_word_scores_fixations_exp2"
)
movieqa_human_path_3 = os.path.join(
data_path, "datasets/all_word_scores_fixations_exp3"
)
movieqa_split_plot_path = os.path.join(data_path, "datasets/split_plot_UNRESOLVED")
cnn_path = os.path.join(
data_path,
"projects/2019/fixation_prediction/ez-reader-wrapper/predictability/output_cnn/",
)
dm_path = os.path.join(
data_path,
"projects/2019/fixation_prediction/ez-reader-wrapper/predictability/output_dm/",
)
qqp_paws_basedir = os.path.join(data_path, "datasets/paw_google/qqp/paws_qqp/output")
qqp_paws_train_path = os.path.join(qqp_paws_basedir, "train.tsv")
qqp_paws_dev_path = os.path.join(qqp_paws_basedir, "dev.tsv")
qqp_paws_test_path = os.path.join(qqp_paws_basedir, "test.tsv")
qqp_basedir = os.path.join(data_path, "datasets/Quora_question_pair_partition_OG")
qqp_train_path = os.path.join(qqp_basedir, "train.tsv")
qqp_dev_path = os.path.join(qqp_basedir, "dev.tsv")
qqp_test_path = os.path.join(qqp_basedir, "test.tsv")
qqp_kag_basedir = os.path.join(data_path, "datasets/Quora_question_pair_partition_kag")
qqp_kag_train_path = os.path.join(qqp_kag_basedir, "train.tsv")
qqp_kag_dev_path = os.path.join(qqp_kag_basedir, "dev.tsv")
qqp_kag_test_path = os.path.join(qqp_kag_basedir, "test.tsv")
wiki_basedir = os.path.join(data_path, "datasets/paw_google/wiki")
wiki_train_path = os.path.join(wiki_basedir, "train.tsv")
wiki_dev_path = os.path.join(wiki_basedir, "dev.tsv")
wiki_test_path = os.path.join(wiki_basedir, "test.tsv")
msrpc_basedir = os.path.join(data_path, "datasets/MSRPC")
msrpc_train_path = os.path.join(msrpc_basedir, "msr_paraphrase_train.txt")
msrpc_dev_path = os.path.join(msrpc_basedir, "msr_paraphrase_dev.txt")
msrpc_test_path = os.path.join(msrpc_basedir, "msr_paraphrase_test.txt")
sentiment_basedir = os.path.join(data_path, "datasets/sentiment_kag")
sentiment_train_path = os.path.join(sentiment_basedir, "train.tsv")
sentiment_dev_path = os.path.join(sentiment_basedir, "dev.tsv")
sentiment_test_path = os.path.join(sentiment_basedir, "test.tsv")
tamil_basedir = os.path.join(data_path, "datasets/en-ta-parallel-v2")
tamil_train_path = os.path.join(tamil_basedir, "corpus.bcn.train.enta")
tamil_dev_path = os.path.join(tamil_basedir, "corpus.bcn.dev.enta")
tamil_test_path = os.path.join(tamil_basedir, "corpus.bcn.test.enta")
compression_basedir = os.path.join(data_path, "datasets/sentence-compression/data")
compression_train_path = os.path.join(compression_basedir, "train.tsv")
compression_dev_path = os.path.join(compression_basedir, "dev.tsv")
compression_test_path = os.path.join(compression_basedir, "test.tsv")
stanford_basedir = os.path.join(data_path, "datasets/stanfordSentimentTreebank")
stanford_train_path = os.path.join(stanford_basedir, "train.tsv")
stanford_dev_path = os.path.join(stanford_basedir, "dev.tsv")
stanford_test_path = os.path.join(stanford_basedir, "test.tsv")
stanford_sent_basedir = os.path.join(data_path, "datasets/stanfordSentimentTreebank")
stanford_sent_train_path = os.path.join(stanford_sent_basedir, "train_sent.tsv")
stanford_sent_dev_path = os.path.join(stanford_sent_basedir, "dev_sent.tsv")
stanford_sent_test_path = os.path.join(stanford_sent_basedir, "test_sent.tsv")
emb_path = os.path.join(data_path, "Google_word2vec/GoogleNews-vectors-negative300.bin")
glove_path = "glove.840B.300d.txt"
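
A quick orientation sketch (not part of the commit): downstream modules import this file as a flat namespace of hyperparameters and dataset paths. The snippet below assumes only the names defined above, plus the fact that ./data is added as a symlink in this commit.

import os
import config

print(config.DEV)  # cuda if a GPU is visible, otherwise cpu
for path in (config.qqp_paws_train_path, config.msrpc_train_path):
    print(path, os.path.exists(path))  # paths resolve under the ./data symlink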

joint_paraphrase_model/data Symbolic link
View file

@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/

View file

@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/datasets/glove/glove.840B.300d.txt

File diff suppressed because one or more lines are too long

View file

View file

@@ -0,0 +1,416 @@
import logging
import config
def tokenize(sent):
return sent.split(" ")
class Lang:
"""Represents the vocabulary
"""
def __init__(self, name):
self.name = name
self.word2index = {
config.PAD: 0,
config.UNK: 1,
config.NOFIX: 2,
config.SOS: 3,
config.EOS: 4,
}
self.word2count = {}
self.index2word = {
0: config.PAD,
1: config.UNK,
2: config.NOFIX,
3: config.SOS,
4: config.EOS,
}
self.n_words = 5
def add_sentence(self, sentence):
assert isinstance(
sentence, (list, tuple)
), "input to add_sentence must be tokenized"
for word in sentence:
self.add_word(word)
def add_word(self, word):
if word not in self.word2index:
self.word2index[word] = self.n_words
self.word2count[word] = 1
self.index2word[self.n_words] = word
self.n_words += 1
else:
self.word2count[word] += 1
def __add__(self, other):
"""Returns a new Lang object containing the vocabulary from this and
the other Lang object
"""
new_lang = Lang(f"{self.name}_{other.name}")
# Add vocabulary from both Langs
for word in self.word2count.keys():
new_lang.add_word(word)
for word in other.word2count.keys():
new_lang.add_word(word)
# Fix the counts on the new one
for word in new_lang.word2count.keys():
new_lang.word2count[word] = self.word2count.get(
word, 0
) + other.word2count.get(word, 0)
return new_lang
def load_wiki(split):
"""Load the Wiki from PAWs"""
logger = logging.getLogger(f"{__name__}.load_wiki")
lang = Lang("wiki")
if split == "train":
path = config.wiki_train_path
elif split == "val":
path = config.wiki_dev_path
elif split == "test":
path = config.wiki_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# skip header
handle.readline()
for line in handle:
_, sent1, sent2, rating = line.strip().split("\t")
if rating == "0":
continue
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
# MS makes the vocab for paraphrase the same
return pairs, lang
def load_qqp_paws(split):
"""Load the QQP from PAWs"""
logger = logging.getLogger(f"{__name__}.load_qqp_paws")
lang = Lang("qqp_paws")
if split == "train":
path = config.qqp_paws_train_path
elif split == "val":
path = config.qqp_paws_dev_path
elif split == "test":
path = config.qqp_paws_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# skip header
handle.readline()
for line in handle:
_, sent1, sent2, rating = line.strip().split("\t")
if rating == "0":
continue
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
# MS makes the vocab for paraphrase the same
return pairs, lang
def load_qqp(split):
"""Load the QQP from Original"""
logger = logging.getLogger(f"{__name__}.load_qqp")
lang = Lang("qqp")
if split == "train":
path = config.qqp_train_path
elif split == "val":
path = config.qqp_dev_path
elif split == "test":
path = config.qqp_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# skip header
handle.readline()
for line in handle:
rating, sent1, sent2, _ = line.strip().split("\t")
if rating == "0":
continue
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
# MS makes the vocab for paraphrase the same
return pairs, lang
def load_qqp_kag(split):
"""Load the QQP from Kaggle""" #not original right now, expriemnting with kaggle 100K, 3K, 30K split
logger = logging.getLogger(f"{__name__}.load_qqp_kag")
lang = Lang("qqp_kag")
if split == "train":
path = config.qqp_kag_train_path
elif split == "val":
path = config.qqp_kag_dev_path
elif split == "test":
path = config.qqp_kag_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# skip header
handle.readline()
for line in handle:  # the Kaggle version has 3 fields rather than 4
rating, sent1, sent2 = line.strip().split("\t")
if rating == "0":
continue
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
# MS makes the vocab for paraphrase the same
return pairs, lang
def load_msrpc(split):
"""Load the Microsoft Research Paraphrase Corpus (MSRPC)"""
logger = logging.getLogger(f"{__name__}.load_msrpc")
lang = Lang("msrpc")
if split == "train":
path = config.msrpc_train_path
elif split == "val":
path = config.msrpc_dev_path
elif split == "test":
path = config.msrpc_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# skip header
handle.readline()
for line in handle:
rating, _, _, sent1, sent2 = line.strip().split("\t")
if rating == "0":
continue
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
# return src_lang, dst_lang, pairs
# MS makes the vocab for paraphrase the same
return pairs, lang
def load_sentiment(split):
"""Load the Sentiment Kaggle Comp Dataset"""
logger = logging.getLogger(f"{__name__}.load_sentiment")
lang = Lang("sentiment")
if split == "train":
path = config.sentiment_train_path
elif split == "val":
path = config.sentiment_dev_path
elif split == "test":
path = config.sentiment_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# skip header
handle.readline()
for line in handle:
_, _, sent1, sent2 = line.strip().split("\t")
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
return pairs, lang
def load_tamil(split):
"""Load the En to Tamil dataset, current SOTA ~13 bleu"""
logger = logging.getLogger(f"{__name__}.load_tamil")
lang = Lang("tamil")
if split == "train":
path = config.tamil_train_path
elif split == "val":
path = config.tamil_dev_path
elif split == "test":
path = config.tamil_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
handle.readline()
for line in handle:
sent1, sent2 = line.strip().split("\t")
# no rating column in this dataset
sent1 = tokenize(sent1)
# TODO: plain whitespace tokenization may be inadequate for Tamil
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
pairs.append([sent1, sent2])
return pairs, lang
def load_compression(split):
"""Load the Google Sentence Compression Dataset"""
logger = logging.getLogger(f"{__name__}.load_compression")
lang = Lang("compression")
if split == "train":
path = config.compression_train_path
elif split == "val":
path = config.compression_dev_path
elif split == "test":
path = config.compression_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
handle.readline()
for line in handle:
sent1, sent2 = line.strip().split("\t")
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
# print(len(sent1), sent1)
# print(len(sent2), sent2)
# print()
lang.add_sentence(sent1)
lang.add_sentence(sent2)
pairs.append([sent1, sent2])
return pairs, lang
def load_stanford(split):
"""Load the Stanford Sentiment Dataset phrases"""
logger = logging.getLogger(f"{__name__}.load_stanford")
lang = Lang("stanford")
if split == "train":
path = config.stanford_train_path
elif split == "val":
path = config.stanford_dev_path
elif split == "test":
path = config.stanford_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# no header row to skip in this file
#handle.readline()
for line in handle:
_, _, sent1, sent2 = line.strip().split("\t")
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
return pairs, lang
def load_stanford_sent(split):
"""Load the Stanford Sentiment Dataset sentences"""
logger = logging.getLogger(f"{__name__}.load_stanford_sent")
lang = Lang("stanford_sent")
if split == "train":
path = config.stanford_sent_train_path
elif split == "val":
path = config.stanford_sent_dev_path
elif split == "test":
path = config.stanford_sent_test_path
logger.info("loading %s from %s" % (split, path))
pairs = []
with open(path) as handle:
# no header row to skip in this file
#handle.readline()
for line in handle:
_, _, sent1, sent2 = line.strip().split("\t")
sent1 = tokenize(sent1)
sent2 = tokenize(sent2)
lang.add_sentence(sent1)
lang.add_sentence(sent2)
# pairs.append([sent1, sent2, rating])
pairs.append([sent1, sent2])
return pairs, lang
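
A usage sketch (illustrative, not part of the commit): every loader returns a (pairs, lang) tuple, and Lang.__add__ merges two vocabularies with summed word counts, which is how a shared vocabulary for joint training can be assembled. The module name data is an assumption here; only functions defined above are used.

import data  # assumed name for the module above

train_pairs, train_lang = data.load_qqp_paws("train")
val_pairs, val_lang = data.load_qqp_paws("val")
lang = train_lang + val_lang  # union of the vocabularies, counts summed
print(lang.n_words, len(train_pairs), len(val_pairs))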

View file

@@ -0,0 +1 @@
from .main import *

View file

@@ -0,0 +1,131 @@
from collections import OrderedDict
import logging
import sys
from .self_attention import Transformer
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence
def random_embedding(vocab_size, embedding_dim):
pretrain_emb = np.empty([vocab_size, embedding_dim])
scale = np.sqrt(3.0 / embedding_dim)
for index in range(vocab_size):
pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
return pretrain_emb
def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
outputs = outputs.view(batch_size * seq_len, -1)
score = F.log_softmax(outputs, 1)
loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
score, batch_label.view(batch_size * seq_len)
)
loss = loss / batch_size
_, tag_seq = torch.max(score, 1)
tag_seq = tag_seq.view(batch_size, seq_len)
# print(score[0], tag_seq[0])
return loss, tag_seq
def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
# score = torch.nn.functional.softmax(outputs, 1)
score = torch.sigmoid(outputs)
mask = torch.zeros_like(score)
for i, v in enumerate(word_seq_length):
mask[i, 0:v] = 1
score = score * mask
loss = nn.MSELoss(reduction="sum")(
score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
)
loss = loss / batch_size
return loss, score.view(batch_size, seq_len)
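# Illustrative shape check (not in the original file): mse_loss zeroes the
# padded positions with the mask before taking a summed MSE over the batch.
def _mse_loss_example():
    outputs = torch.randn(2, 5, 1)  # [batch, seq, 1] raw scores
    labels = torch.rand(2, 5)       # per-token fixation targets
    # the second sequence has only 3 real tokens; the rest is padding
    return mse_loss(outputs, labels, batch_size=2, seq_len=5, word_seq_length=[5, 3])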
class Network(nn.Module):
def __init__(
self,
embedding_type,
vocab_size,
embedding_dim,
dropout,
hidden_dim,
embeddings=None,
attention=True,
):
super().__init__()
self.logger = logging.getLogger(f"{__name__}")
prelayers = OrderedDict()
postlayers = OrderedDict()
if embedding_type in ("w2v", "glove"):
if embeddings is not None:
prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
else:
prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
embedding_dim = 300
elif embedding_type == "bert":
embedding_dim = 768
self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)
if attention:
# attention config history: 1024-D hidden with 16 heads / 16 layers for the attention vs. no-attention experiments;
# the initial attention and pretraining runs used 4 heads and 4 layers at 128-D;
# 128-D with 4 heads and 1 layer produced the results reported for IUI
###postlayers["position_encodings"] = PositionalEncoding(hidden_dim)
postlayers["attention_layer"] = Transformer(
d_model=hidden_dim, n_heads=4, n_layers=1
)
postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
postlayers["ff_activation"] = nn.ReLU()
postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)
self.logger.info(f"prelayers: {prelayers.keys()}")
self.logger.info(f"postlayers: {postlayers.keys()}")
self.pre = nn.Sequential(prelayers)
self.post = nn.Sequential(postlayers)
def forward(self, x, word_seq_length):
x = self.pre(x)
x = self.lstm(x, word_seq_length)
# debugging (MS): print the fixation model parameters
#for p in self.parameters():
# print(p.data)
# break
return self.post(x.transpose(1, 0))
class BiLSTM(nn.Module):
def __init__(self, embedding_dim, lstm_hidden, num_layers):
super().__init__()
self.net = nn.LSTM(
input_size=embedding_dim,
hidden_size=lstm_hidden,
num_layers=num_layers,
batch_first=True,
bidirectional=True,
)
def forward(self, x, word_seq_length):
packed_words = pack_padded_sequence(x, word_seq_length, batch_first=True, enforce_sorted=False)
lstm_out, hidden = self.net(packed_words)
lstm_out, _ = pad_packed_sequence(lstm_out)
return lstm_out
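
A forward-pass sketch (illustrative, not part of the commit; all sizes are hypothetical): random embeddings feed the pre layers, the BiLSTM consumes the packed batch, and the post layers emit one score per token, which mse_loss above consumes directly.

import torch

vocab_size, emb_dim, hidden = 100, 300, 128
emb = torch.tensor(random_embedding(vocab_size, emb_dim), dtype=torch.float32)
net = Network("glove", vocab_size, emb_dim, dropout=0.5, hidden_dim=hidden, embeddings=emb)
tokens = torch.randint(0, vocab_size, (2, 7))  # [batch, seq] word indices
lengths = [7, 5]                               # true (unpadded) lengths
scores = net(tokens, lengths)                  # [batch, seq, 1]
loss, preds = mse_loss(scores, torch.rand(2, 7), 2, 7, lengths)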

View file

@@ -0,0 +1,131 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
class PositionalEncoding(nn.Module):
def __init__(self, d_hid, n_position=200):
super(PositionalEncoding, self).__init__()
# Not a parameter
self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))
def _get_sinusoid_encoding_table(self, n_position, d_hid):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
return torch.FloatTensor(sinusoid_table).unsqueeze(0)
def forward(self, x):
return x + self.pos_table[:, :x.size(1)].clone().detach()
class AttentionLayer(nn.Module):
def __init__(self):
super(AttentionLayer, self).__init__()
def forward(self, Q, K, V):
# Q: float32:[batch_size, n_queries, d_k]
# K: float32:[batch_size, n_keys, d_k]
# V: float32:[batch_size, n_keys, d_v]
dk = K.shape[-1]
dv = V.shape[-1]
KT = torch.transpose(K, -1, -2)
weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
# weight_logits: float32[batch_size, n_queries, n_keys]
weights = F.softmax(weight_logits, dim=-1)
# weight: float32[batch_size, n_queries, n_keys]
return torch.bmm(weights, V)
# return float32[batch_size, n_queries, dv]
class MultiHeadedSelfAttentionLayer(nn.Module):
def __init__(self, d_model, n_heads):
super(MultiHeadedSelfAttentionLayer, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
assert d_model % n_heads == 0, f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
self.d_k = d_model // n_heads
self.d_v = self.d_k
self.attention_layer = AttentionLayer()
self.W_Qs = nn.ModuleList([
nn.Linear(d_model, self.d_k, bias=False)
for _ in range(n_heads)
])
self.W_Ks = nn.ModuleList([
nn.Linear(d_model, self.d_k, bias=False)
for _ in range(n_heads)
])
self.W_Vs = nn.ModuleList([
nn.Linear(d_model, self.d_v, bias=False)
for _ in range(n_heads)
])
self.W_O = nn.Linear(d_model, d_model, bias=False)
def forward(self, x):
# x:float32[batch_size, sequence_length, self.d_model]
head_outputs = []
for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
Q = W_Q(x)
# Q: float32[batch_size, sequence_length, self.d_k]
K = W_K(x)
# K: float32[batch_size, sequence_length, self.d_k]
V = W_V(x)
# V: float32[batch_size, sequence_length, self.d_v]
head_output = self.attention_layer(Q, K, V)
# float32:[batch_size, sequence_length, self.d_v]
head_outputs.append(head_output)
concatenated = torch.cat(head_outputs, dim=-1)
# concatenated float32:[batch_size, sequence_length, self.d_model]
out = self.W_O(concatenated)
# out float32:[batch_size, sequence_length, self.d_model]
return out
class Feedforward(nn.Module):
def __init__(self, d_model):
super(Feedforward, self).__init__()
self.d_model = d_model
self.W1 = nn.Linear(d_model, d_model)
self.W2 = nn.Linear(d_model, d_model)
def forward(self, x):
# x: float32[batch_size, sequence_length, d_model]
return self.W2(torch.relu(self.W1(x)))
class Transformer(nn.Module):
def __init__(self, d_model, n_heads, n_layers):
super(Transformer, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
self.n_layers = n_layers
self.attention_layers = nn.ModuleList([
MultiHeadedSelfAttentionLayer(d_model, n_heads)
for _ in range(n_layers)
])
self.ffs = nn.ModuleList([
Feedforward(d_model)
for _ in range(n_layers)
])
def forward(self, x):
# x: float32[batch_size, sequence_length, self.d_model]
for attention_layer, ff in zip(self.attention_layers, self.ffs):
attention_out = attention_layer(x)
# attention_out: float32[batch_size, sequence_length, self.d_model]
x = F.layer_norm(x + attention_out, x.shape[2:])
ff_out = ff(x)
# ff_out: float32[batch_size, sequence_length, self.d_model]
x = F.layer_norm(x + ff_out, x.shape[2:])
return x
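
A quick shape check (illustrative, not part of the commit): each self-attention and feed-forward block is wrapped in a residual connection followed by layer norm, so the [batch_size, sequence_length, d_model] shape is preserved end to end.

import torch

model = Transformer(d_model=128, n_heads=4, n_layers=2)
x = torch.randn(2, 10, 128)
assert model(x).shape == (2, 10, 128)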

View file

@@ -0,0 +1 @@
from .main import *

View file

@@ -0,0 +1,86 @@
import json
import math
import os
import random
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, embeddings):
super(EncoderRNN, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding.from_pretrained(embeddings)
self.gru = nn.GRU(input_size, hidden_size)
def forward(self, input, hidden):
embedded = self.embedding(input).view(1, 1, -1)
output = embedded
output, hidden = self.gru(output, hidden)
return output, hidden
def initHidden(self):
return torch.zeros(1, 1, self.hidden_size)
class AttnDecoderRNN(nn.Module):
def __init__(
self,
input_size,
hidden_size,
output_size,
embeddings,
dropout_p,
max_length,
):
super(AttnDecoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.dropout_p = dropout_p
self.max_length = max_length
self.embedding = nn.Embedding.from_pretrained(embeddings)  # for paraphrase generation
#self.embedding = nn.Embedding(len(embeddings), 300)  # for NMT with Tamil; also trying this with sentiment
self.attn = nn.Linear(self.input_size + self.hidden_size, self.max_length)
self.attn_combine = nn.Linear(
self.input_size + self.hidden_size, self.hidden_size
)
self.dropout = nn.Dropout(self.dropout_p)
self.gru = nn.GRU(self.hidden_size, self.hidden_size)
self.out = nn.Linear(self.hidden_size, self.output_size)
def forward(self, input, hidden, encoder_outputs, fixations):
embedded = self.embedding(input).view(1, 1, -1)
embedded = self.dropout(embedded)
attn_weights = F.softmax(
self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1
)
attn_weights = attn_weights * torch.nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)(fixations.squeeze().unsqueeze(0))
# attn_weights = torch.softmax(attn_weights * torch.nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)(fixations.squeeze().unsqueeze(0)), dim=1)
attn_applied = torch.bmm(
attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
)
output = torch.cat((embedded[0], attn_applied[0]), 1)
output = self.attn_combine(output).unsqueeze(0)
output = F.relu(output)
output, hidden = self.gru(output, hidden)
# output = F.log_softmax(self.out(output[0]), dim=1)
output = self.out(output[0])
# output = F.log_softmax(output, dim=1)
return output, hidden, attn_weights
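
A single decoding step (illustrative, not part of the commit; the sizes and the random embedding matrix are stand-ins): the encoder is run token by token into a padded buffer, and the decoder's attention weights are rescaled by per-token fixation scores before being applied to the encoder outputs.

import torch

vocab, hidden, max_len = 50, 300, 10
emb = torch.randn(vocab, 300)  # stand-in for the pretrained embedding matrix
encoder = EncoderRNN(300, hidden, emb)
decoder = AttnDecoderRNN(300, hidden, vocab, emb, dropout_p=0.1, max_length=max_len)

enc_hidden = encoder.initHidden()
encoder_outputs = torch.zeros(max_len, hidden)
for i, tok in enumerate([3, 7, 1]):  # a toy three-token input sentence
    out, enc_hidden = encoder(torch.tensor([tok]), enc_hidden)
    encoder_outputs[i] = out[0, 0]

fixations = torch.rand(3, 1)  # one fixation score per source token
logits, dec_hidden, attn = decoder(torch.tensor([0]), enc_hidden, encoder_outputs, fixations)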

View file

@@ -0,0 +1,225 @@
import json
import logging
import math
import os
import random
import re
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn
import config
plt.switch_backend("agg")
def load_glove(vocabulary):
logger = logging.getLogger(f"{__name__}.load_glove")
logger.info("loading embeddings")
try:
with open(f"glove.cache") as h:
cache = json.load(h)
except:
logger.info("cache doesn't exist")
cache = {}
cache[config.PAD] = [0] * 300
cache[config.SOS] = [0] * 300
cache[config.EOS] = [0] * 300
cache[config.UNK] = [0] * 300
cache[config.NOFIX] = [0] * 300
else:
logger.info("cache found")
cache_miss = False
if not set(vocabulary) <= set(cache):
cache_miss = True
logger.warning("cache miss, loading full embeddings")
data = {}
with open("glove.840B.300d.txt") as h:
for line in h:
word, *emb = line.strip().split()
try:
data[word] = [float(x) for x in emb]
except ValueError:
continue
logger.info("finished loading full embeddings")
for word in vocabulary:
try:
cache[word] = data[word]
except KeyError:
cache[word] = [0] * 300
logger.info("cache updated")
embeddings = []
for word in vocabulary:
embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
embeddings = torch.stack(embeddings)
if cache_miss:
with open(f"glove.cache", "w") as h:
json.dump(cache, h)
logger.info("cache saved")
return embeddings
def tokenize(s):
s = s.lower().strip()
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
s = s.split(" ")
return s
def indices_from_sentence(word2index, sentence, unknown_threshold):
if unknown_threshold:
return [
word2index.get(
word if random.random() > unknown_threshold else config.UNK,
word2index[config.UNK],
)
for word in sentence
]
else:
return [
word2index.get(word, word2index[config.UNK]) for word in sentence
]
def tensor_from_sentence(word2index, sentence, unknown_threshold):
# indices = [config.SOS]
indices = indices_from_sentence(word2index, sentence, unknown_threshold)
indices.append(word2index[config.EOS])
return torch.tensor(indices, dtype=torch.long, device=config.DEV)
def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
tensors = [
tensor_from_sentence(word2index, pair[0], unknown_threshold),
tensor_from_sentence(word2index, pair[1], unknown_threshold),
]
if shuffle:
random.shuffle(tensors)
return tensors
def bleu(reference, hypothesis, n=4):  # uniform weights over 1..n-grams cap the n-gram order at n
if n < 1:
return 0
weights = [1/n]*n
return sentence_bleu([reference], hypothesis, weights)
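# Illustrative smoke test (not in the original file): identical sentences
# score 1.0, and the uniform weights cap the n-gram order at n.
def _bleu_example():
    reference = "the cat sat on the mat".split()
    assert abs(bleu(reference, reference) - 1.0) < 1e-6
    assert bleu(reference, reference, n=0) == 0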
def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
if shuffle:
pairs = pairs.copy()
random.shuffle(pairs)
for pair in pairs:
tensor1, tensor2 = tensors_from_pair(word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold)
yield (tensor1,), (tensor2,)
def sent_iter(sents, word2index, unknown_threshold=0.00):
for sent in sents:
tensor = tensor_from_sentence(word2index, sent, unknown_threshold)
yield (tensor,)
def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
for i in range(len(pairs) // batch_size):
batch = pairs[i * batch_size : (i + 1) * batch_size]
if len(batch) != batch_size:
continue
batch_tensors = [
tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
for pair in batch
]
tensors1, tensors2 = zip(*batch_tensors)
# targets = torch.tensor(targets, dtype=torch.long, device=config.DEV)
# tensors1_lengths = [len(t) for t in tensors1]
# tensors2_lengths = [len(t) for t in tensors2]
# tensors1 = nn.utils.rnn.pack_sequence(tensors1, enforce_sorted=False)
# tensors2 = nn.utils.rnn.pack_sequence(tensors2, enforce_sorted=False)
yield tensors1, tensors2
def asMinutes(s):
m = math.floor(s / 60)
s -= m * 60
return "%dm %ds" % (m, s)
def timeSince(since, percent):
now = time.time()
s = now - since
es = s / (percent)
rs = es - s
return "%s (- %s)" % (asMinutes(s), asMinutes(rs))
def showPlot(points):
plt.figure()
fig, ax = plt.subplots()
# this locator puts ticks at regular intervals
loc = ticker.MultipleLocator(base=0.2)
ax.yaxis.set_major_locator(loc)
plt.plot(points)
def showAttention(input_sentence, output_words, attentions):
# Set up figure with colorbar
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(attentions.numpy(), cmap="bone")
fig.colorbar(cax)
# Set up axes
ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
ax.set_yticklabels([""] + output_words)
# Show label at every tick
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
plt.show()
def evaluateAndShowAttention(input_sentence):
# assumes evaluate(), encoder1, and attn_decoder1 are defined elsewhere in the training script
output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
print("input =", input_sentence)
print("output =", " ".join(output_words))
showAttention(input_sentence, output_words, attentions)
def save_model(model,