Add NLP task models
parent d8beb17dfb, commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions
428 joint_paraphrase_model/.gitignore vendored Normal file
@@ -0,0 +1,428 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,latex
# Edit at https://www.toptal.com/developers/gitignore?templates=python,latex

### LaTeX ###
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# todonotes
*.tdo

# vhistory
*.hst
*.ver

# easy-todo
*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

### LaTeX Patch ###
# LIPIcs / OASIcs
*.vtc

# glossaries
*.glstex

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# End of https://www.toptal.com/developers/gitignore/api/python,latex
3 joint_paraphrase_model/README.md Normal file
@@ -0,0 +1,3 @@
# joint_paraphrase_model

Joint training paraphrase model (NeurIPS).
112 joint_paraphrase_model/config.py Normal file
@@ -0,0 +1,112 @@
import os
import torch


# general
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PAD = "<__PAD__>"
UNK = "<__UNK__>"
NOFIX = "<__NOFIX__>"
SOS = "<__SOS__>"
EOS = "<__EOS__>"

batch_size = 1
teacher_forcing_ratio = 0.5
embedding_dim = 300
fix_hidden_dim = 128
par_hidden_dim = 1024
fix_dropout = 0.5
par_dropout = 0.2
_fix_learning_rate = 0.00001
_par_learning_rate = 0.0001
learning_rate = _par_learning_rate
fix_momentum = 0.9
par_momentum = 0.0
max_length = 121
epochs = 5

# paths
data_path = "./data"
provo_predictability_path = os.path.join(
    data_path, "datasets/provo/Provo_Corpus-Predictability_Norms.csv"
)
provo_eyetracking_path = os.path.join(
    data_path, "datasets/provo/Provo_Corpus-Eyetracking_Data.csv"
)

geco_en_path = os.path.join(data_path, "datasets/geco/EnglishMaterial.csv")
geco_mono_path = os.path.join(data_path, "datasets/geco/MonolingualReadingData.csv")

movieqa_human_path = os.path.join(data_path, "datasets/all_word_scores_fixations")
movieqa_human_path_2 = os.path.join(
    data_path, "datasets/all_word_scores_fixations_exp2"
)
movieqa_human_path_3 = os.path.join(
    data_path, "datasets/all_word_scores_fixations_exp3"
)
movieqa_split_plot_path = os.path.join(data_path, "datasets/split_plot_UNRESOLVED")

cnn_path = os.path.join(
    data_path,
    "projects/2019/fixation_prediction/ez-reader-wrapper/predictability/output_cnn/",
)
dm_path = os.path.join(
    data_path,
    "projects/2019/fixation_prediction/ez-reader-wrapper/predictability/output_dm/",
)

qqp_paws_basedir = os.path.join(data_path, "datasets/paw_google/qqp/paws_qqp/output")
qqp_paws_train_path = os.path.join(qqp_paws_basedir, "train.tsv")
qqp_paws_dev_path = os.path.join(qqp_paws_basedir, "dev.tsv")
qqp_paws_test_path = os.path.join(qqp_paws_basedir, "test.tsv")

qqp_basedir = os.path.join(data_path, "datasets/Quora_question_pair_partition_OG")
qqp_train_path = os.path.join(qqp_basedir, "train.tsv")
qqp_dev_path = os.path.join(qqp_basedir, "dev.tsv")
qqp_test_path = os.path.join(qqp_basedir, "test.tsv")

qqp_kag_basedir = os.path.join(data_path, "datasets/Quora_question_pair_partition_kag")
qqp_kag_train_path = os.path.join(qqp_kag_basedir, "train.tsv")
qqp_kag_dev_path = os.path.join(qqp_kag_basedir, "dev.tsv")
qqp_kag_test_path = os.path.join(qqp_kag_basedir, "test.tsv")

wiki_basedir = os.path.join(data_path, "datasets/paw_google/wiki")
wiki_train_path = os.path.join(wiki_basedir, "train.tsv")
wiki_dev_path = os.path.join(wiki_basedir, "dev.tsv")
wiki_test_path = os.path.join(wiki_basedir, "test.tsv")

msrpc_basedir = os.path.join(data_path, "datasets/MSRPC")
msrpc_train_path = os.path.join(msrpc_basedir, "msr_paraphrase_train.txt")
msrpc_dev_path = os.path.join(msrpc_basedir, "msr_paraphrase_dev.txt")
msrpc_test_path = os.path.join(msrpc_basedir, "msr_paraphrase_test.txt")

sentiment_basedir = os.path.join(data_path, "datasets/sentiment_kag")
sentiment_train_path = os.path.join(sentiment_basedir, "train.tsv")
sentiment_dev_path = os.path.join(sentiment_basedir, "dev.tsv")
sentiment_test_path = os.path.join(sentiment_basedir, "test.tsv")

tamil_basedir = os.path.join(data_path, "datasets/en-ta-parallel-v2")
tamil_train_path = os.path.join(tamil_basedir, "corpus.bcn.train.enta")
tamil_dev_path = os.path.join(tamil_basedir, "corpus.bcn.dev.enta")
tamil_test_path = os.path.join(tamil_basedir, "corpus.bcn.test.enta")

compression_basedir = os.path.join(data_path, "datasets/sentence-compression/data")
compression_train_path = os.path.join(compression_basedir, "train.tsv")
compression_dev_path = os.path.join(compression_basedir, "dev.tsv")
compression_test_path = os.path.join(compression_basedir, "test.tsv")

stanford_basedir = os.path.join(data_path, "datasets/stanfordSentimentTreebank")
stanford_train_path = os.path.join(stanford_basedir, "train.tsv")
stanford_dev_path = os.path.join(stanford_basedir, "dev.tsv")
stanford_test_path = os.path.join(stanford_basedir, "test.tsv")

stanford_sent_basedir = os.path.join(data_path, "datasets/stanfordSentimentTreebank")
stanford_sent_train_path = os.path.join(stanford_basedir, "train_sent.tsv")
stanford_sent_dev_path = os.path.join(stanford_basedir, "dev_sent.tsv")
stanford_sent_test_path = os.path.join(stanford_basedir, "test_sent.tsv")


emb_path = os.path.join(data_path, "Google_word2vec/GoogleNews-vectors-negative300.bin")

glove_path = "glove.840B.300d.txt"
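
For orientation, the values above are consumed as plain module attributes throughout the repo. A minimal usage sketch (illustrative only, not part of the commit):

    import torch
    import config

    # Models and tensors are placed on config.DEV (CUDA when available).
    hidden = torch.zeros(1, 1, config.par_hidden_dim, device=config.DEV)

    # Dataset locations are read the same way, e.g. the QQP-PAWS train split:
    with open(config.qqp_paws_train_path) as handle:
        header = handle.readline()
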
1 joint_paraphrase_model/data Symbolic link
@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/
1 joint_paraphrase_model/glove.840B.300d.txt Symbolic link
@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/datasets/glove/glove.840B.300d.txt
1 joint_paraphrase_model/glove.cache Normal file
File diff suppressed because one or more lines are too long
0 joint_paraphrase_model/libs/__init__.py Normal file
416 joint_paraphrase_model/libs/corpora.py Normal file
@@ -0,0 +1,416 @@
import logging

import config


def tokenize(sent):
    return sent.split(" ")


class Lang:
    """Represents the vocabulary."""

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
            config.NOFIX: 2,
            config.SOS: 3,
            config.EOS: 4,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
            2: config.NOFIX,
            3: config.SOS,
            4: config.EOS,
        }
        self.n_words = 5

    def add_sentence(self, sentence):
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object.
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count.keys():
            new_lang.add_word(word)
        for word in other.word2count.keys():
            new_lang.add_word(word)

        # Fix the counts on the new one
        for word in new_lang.word2count.keys():
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang

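
A short sketch of how Lang accumulates and merges vocabularies (the sentences are illustrative):

    src = Lang("src")
    src.add_sentence(tokenize("the cat sat"))
    dst = Lang("dst")
    dst.add_sentence(tokenize("the dog sat"))
    merged = src + dst                      # a new Lang named "src_dst"
    assert merged.word2count["the"] == 2    # counts are summed by __add__
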
def load_wiki(split):
    """Load the Wiki pairs from PAWS."""
    logger = logging.getLogger(f"{__name__}.load_wiki")
    lang = Lang("wiki")

    if split == "train":
        path = config.wiki_train_path
    elif split == "val":
        path = config.wiki_dev_path
    elif split == "test":
        path = config.wiki_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_qqp_paws(split):
    """Load the QQP pairs from PAWS."""
    logger = logging.getLogger(f"{__name__}.load_qqp_paws")
    lang = Lang("qqp_paws")

    if split == "train":
        path = config.qqp_paws_train_path
    elif split == "val":
        path = config.qqp_paws_dev_path
    elif split == "test":
        path = config.qqp_paws_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_qqp(split):
    """Load QQP from the original partition."""
    logger = logging.getLogger(f"{__name__}.load_qqp")
    lang = Lang("qqp")

    if split == "train":
        path = config.qqp_train_path
    elif split == "val":
        path = config.qqp_dev_path
    elif split == "test":
        path = config.qqp_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            rating, sent1, sent2, _ = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_qqp_kag(split):
    """Load QQP from Kaggle."""
    # not the original partition; experimenting with the Kaggle 100K/3K/30K split
    logger = logging.getLogger(f"{__name__}.load_qqp_kag")
    lang = Lang("qqp_kag")

    if split == "train":
        path = config.qqp_kag_train_path
    elif split == "val":
        path = config.qqp_kag_dev_path
    elif split == "test":
        path = config.qqp_kag_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        # the Kaggle version has three fields per line, not four
        for line in handle:
            rating, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_msrpc(split):
    """Load the Microsoft Research Paraphrase Corpus (MSRPC)."""
    logger = logging.getLogger(f"{__name__}.load_msrpc")
    lang = Lang("msrpc")

    if split == "train":
        path = config.msrpc_train_path
    elif split == "val":
        path = config.msrpc_dev_path
    elif split == "test":
        path = config.msrpc_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            rating, _, _, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # return src_lang, dst_lang, pairs
    # MS makes the vocab for paraphrase the same

    return pairs, lang


def load_sentiment(split):
    """Load the sentiment dataset from the Kaggle competition."""
    logger = logging.getLogger(f"{__name__}.load_sentiment")
    lang = Lang("sentiment")

    if split == "train":
        path = config.sentiment_train_path
    elif split == "val":
        path = config.sentiment_dev_path
    elif split == "test":
        path = config.sentiment_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []

    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    return pairs, lang


def load_tamil(split):
    """Load the English-Tamil parallel dataset (current SOTA is about 13 BLEU)."""
    logger = logging.getLogger(f"{__name__}.load_tamil")
    lang = Lang("tamil")

    if split == "train":
        path = config.tamil_train_path
    elif split == "val":
        path = config.tamil_dev_path
    elif split == "test":
        path = config.tamil_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            # if rating == "0":
            #     continue
            sent1 = tokenize(sent1)
            # NOTE: plain whitespace tokenization; unclear whether this is adequate for Tamil
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_compression(split):
    """Load the Google sentence compression dataset."""
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.compression_train_path
    elif split == "val":
        path = config.compression_dev_path
    elif split == "test":
        path = config.compression_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            # print(len(sent1), sent1)
            # print(len(sent2), sent2)
            # print()
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford(split):
    """Load the Stanford Sentiment Treebank phrases."""
    logger = logging.getLogger(f"{__name__}.load_stanford")
    lang = Lang("stanford")

    if split == "train":
        path = config.stanford_train_path
    elif split == "val":
        path = config.stanford_dev_path
    elif split == "test":
        path = config.stanford_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []

    with open(path) as handle:

        # skip header
        # handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford_sent(split):
    """Load the Stanford Sentiment Treebank sentences."""
    logger = logging.getLogger(f"{__name__}.load_stanford_sent")
    lang = Lang("stanford_sent")

    if split == "train":
        path = config.stanford_sent_train_path
    elif split == "val":
        path = config.stanford_sent_dev_path
    elif split == "test":
        path = config.stanford_sent_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []

    with open(path) as handle:

        # skip header
        # handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    return pairs, lang
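
All loaders share the same contract: they return tokenized sentence pairs plus a Lang covering both sides. A usage sketch (assumes the TSVs referenced in config.py exist on disk):

    train_pairs, train_lang = load_qqp_paws("train")
    val_pairs, val_lang = load_qqp_paws("val")
    joint_lang = train_lang + val_lang      # merge vocabularies across splits
    print(len(train_pairs), joint_lang.n_words)
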
@@ -0,0 +1 @@
from .main import *
131 joint_paraphrase_model/libs/fixation_generation/main.py Normal file
@@ -0,0 +1,131 @@
from collections import OrderedDict
import logging
import sys

from .self_attention import Transformer

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence


def random_embedding(vocab_size, embedding_dim):
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)

    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)

    # print(score[0], tag_seq[0])

    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    # score = torch.nn.functional.softmax(outputs, 1)
    score = torch.sigmoid(outputs)

    # mask out padding positions beyond each sequence's true length
    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1

    score = score * mask

    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )

    loss = loss / batch_size

    return loss, score.view(batch_size, seq_len)


class Network(nn.Module):
    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if attention:
            # increased complexity with 1024D and heads 16, layers 16: for the no-att and att experiments
            # before, for the initial att and pretraining: heads 4 and layers 4, 128D
            # then 128D with heads 4, layers 1 = results for all IUI
            # postlayers["position_encodings"] = PositionalEncoding(hidden_dim)
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)
        # MS: printing fix model params
        # for p in self.parameters():
        #     print(p.data)
        #     break

        return self.post(x.transpose(1, 0))


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(x, word_seq_length, True, False)
        lstm_out, hidden = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
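
A minimal forward pass through the fixation network, assuming a toy vocabulary of 1000 ids and randomly initialized embeddings (illustrative only, not part of the commit):

    import torch

    net = Network(
        embedding_type="glove", vocab_size=1000, embedding_dim=300,
        dropout=0.5, hidden_dim=128,
    )
    tokens = torch.randint(0, 1000, (1, 12))   # one 12-token sentence
    lengths = torch.tensor([12])
    scores = net(tokens, lengths)              # raw per-token scores; mse_loss applies the sigmoid
    print(scores.shape)                        # torch.Size([1, 12, 1])
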
@@ -0,0 +1,131 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        # Not a parameter
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy

        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()


class AttentionLayer(nn.Module):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def forward(self, Q, K, V):
        # Q: float32[batch_size, n_queries, d_k]
        # K: float32[batch_size, n_keys, d_k]
        # V: float32[batch_size, n_keys, d_v]
        dk = K.shape[-1]
        dv = V.shape[-1]
        KT = torch.transpose(K, -1, -2)
        weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
        # weight_logits: float32[batch_size, n_queries, n_keys]
        weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
        return torch.bmm(weights, V)
        # returns float32[batch_size, n_queries, d_v]


class MultiHeadedSelfAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        print('{} {}'.format(d_model, n_heads))
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q: float32[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K: float32[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V: float32[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # head_output: float32[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated: float32[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out: float32[batch_size, sequence_length, self.d_model]
        return out


class Feedforward(nn.Module):
    def __init__(self, d_model):
        super(Feedforward, self).__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, d_model]
        return self.W2(torch.relu(self.W1(x)))


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, n_layers):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList([
            MultiHeadedSelfAttentionLayer(d_model, n_heads)
            for _ in range(n_layers)
        ])
        self.ffs = nn.ModuleList([
            Feedforward(d_model)
            for _ in range(n_layers)
        ])

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        for attention_layer, ff in zip(self.attention_layers, self.ffs):
            attention_out = attention_layer(x)
            # attention_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + attention_out, x.shape[2:])
            ff_out = ff(x)
            # ff_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + ff_out, x.shape[2:])
        return x
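
A shape check for the Transformer block above (dimensions mirror fix_hidden_dim = 128 from config.py; the batch is illustrative):

    import torch

    block = Transformer(d_model=128, n_heads=4, n_layers=1)
    x = torch.randn(2, 10, 128)      # batch of 2 sequences, length 10
    out = block(x)                   # residual + layer norm around attention and feedforward
    print(out.shape)                 # torch.Size([2, 10, 128])
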
@@ -0,0 +1 @@
from .main import *
86 joint_paraphrase_model/libs/paraphrase_generation/main.py Normal file
@@ -0,0 +1,86 @@
import json
import math
import os

import random
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embeddings):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        embeddings,
        dropout_p,
        max_length,
    ):
        super(AttnDecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding.from_pretrained(embeddings)  # for paragen
        # self.embedding = nn.Embedding(len(embeddings), 300)  # for NMT with Tamil; trying with sentiment too
        self.attn = nn.Linear(self.input_size + self.hidden_size, self.max_length)
        self.attn_combine = nn.Linear(
            self.input_size + self.hidden_size, self.hidden_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, fixations):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1
        )

        # re-weight the attention distribution by the predicted fixations,
        # zero-padding the fixation vector up to max_length
        attn_weights = attn_weights * torch.nn.ConstantPad1d(
            (0, attn_weights.shape[-1] - fixations.shape[-2]), 0
        )(fixations.squeeze().unsqueeze(0))

        # attn_weights = torch.softmax(attn_weights * torch.nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)(fixations.squeeze().unsqueeze(0)), dim=1)

        attn_applied = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        # output = F.log_softmax(self.out(output[0]), dim=1)
        output = self.out(output[0])
        # output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights
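
One decoding step, to make the fixation re-weighting concrete. Dimensions follow config.py (300-d embeddings, 1024-d hidden, max_length 121); the 50-word vocabulary and random inputs are illustrative:

    import torch

    vocab_size, emb_dim, hidden_dim, max_len = 50, 300, 1024, 121
    emb = torch.randn(vocab_size, emb_dim)
    decoder = AttnDecoderRNN(emb_dim, hidden_dim, vocab_size, emb,
                             dropout_p=0.2, max_length=max_len)
    token = torch.tensor([[3]])                   # e.g. the SOS index
    hidden = torch.zeros(1, 1, hidden_dim)
    encoder_outputs = torch.zeros(max_len, hidden_dim)
    fixations = torch.rand(7, 1)                  # one score per source token
    out, hidden, attn = decoder(token, hidden, encoder_outputs, fixations)
    print(out.shape, attn.shape)                  # torch.Size([1, 50]) torch.Size([1, 121])
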
225 joint_paraphrase_model/libs/utils.py Normal file
@@ -0,0 +1,225 @@
import json
import logging
import math
import os
import random
import re
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn

import config


plt.switch_backend("agg")


def load_glove(vocabulary):
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except (FileNotFoundError, json.JSONDecodeError):
        logger.info("cache doesn't exist")
        cache = {}
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    # skip malformed lines (e.g. multi-token "words")
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            try:
                cache[word] = data[word]
            except KeyError:
                cache[word] = [0] * 300
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings

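
A usage sketch for the cache-backed loader above (the vocabulary is illustrative; on a cache miss it reads the full glove.840B.300d.txt, which resolves via the symlink added in this commit):

    vocab = [config.PAD, config.UNK, "the", "cat"]
    emb = load_glove(vocab)      # one 300-d row per word, zeros for misses
    print(emb.shape)             # torch.Size([4, 300])
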
def tokenize(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.split(" ")
    return s


def indices_from_sentence(word2index, sentence, unknown_threshold):
    if unknown_threshold:
        # randomly replace a fraction of words with UNK (a simple regularizer)
        return [
            word2index.get(
                word if random.random() > unknown_threshold else config.UNK,
                word2index[config.UNK],
            )
            for word in sentence
        ]
    else:
        return [
            word2index.get(word, word2index[config.UNK]) for word in sentence
        ]


def tensor_from_sentence(word2index, sentence, unknown_threshold):
    # indices = [config.SOS]
    indices = indices_from_sentence(word2index, sentence, unknown_threshold)
    indices.append(word2index[config.EOS])
    return torch.tensor(indices, dtype=torch.long, device=config.DEV)


def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    tensors = [
        tensor_from_sentence(word2index, pair[0], unknown_threshold),
        tensor_from_sentence(word2index, pair[1], unknown_threshold),
    ]
    if shuffle:
        random.shuffle(tensors)
    return tensors


def bleu(reference, hypothesis, n=4):
    # uniform weights spread the score over the 1..n-gram precisions
    if n < 1:
        return 0
    weights = [1 / n] * n
    return sentence_bleu([reference], hypothesis, weights)

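
A quick check of the BLEU helper (tokens are illustrative):

    ref = ["the", "cat", "sat", "on", "the", "mat"]
    hyp = ["the", "cat", "sat", "on", "a", "mat"]
    print(bleu(ref, hyp, n=2))   # bigram-level BLEU for a single pair
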
def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    if shuffle:
        pairs = pairs.copy()
        random.shuffle(pairs)
    for pair in pairs:
        tensor1, tensor2 = tensors_from_pair(word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold)
        yield (tensor1,), (tensor2,)


def sent_iter(sents, word2index, unknown_threshold=0.00):
    for sent in sents:
        tensor = tensor_from_sentence(word2index, sent, unknown_threshold)
        yield (tensor,)


def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    for i in range(len(pairs) // batch_size):
        # stride by batch_size so batches do not overlap
        batch = pairs[i * batch_size : (i + 1) * batch_size]
        if len(batch) != batch_size:
            continue
        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        # targets = torch.tensor(targets, dtype=torch.long, device=config.DEV)

        # tensors1_lengths = [len(t) for t in tensors1]
        # tensors2_lengths = [len(t) for t in tensors2]

        # tensors1 = nn.utils.rnn.pack_sequence(tensors1, enforce_sorted=False)
        # tensors2 = nn.utils.rnn.pack_sequence(tensors2, enforce_sorted=False)

        yield tensors1, tensors2

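
Driving the iterators; word2index would normally come from a Lang built in libs/corpora.py (the toy mapping here is illustrative):

    pairs = [[["hello", "world"], ["hi", "world"]]]   # one tokenized pair
    word2index = {config.PAD: 0, config.UNK: 1, config.EOS: 4,
                  "hello": 5, "world": 6, "hi": 7}
    for (src,), (dst,) in pair_iter(pairs, word2index):
        print(src.tolist(), dst.tolist())             # index sequences ending in EOS
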
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    # NOTE: relies on module-level encoder1, attn_decoder1 and an evaluate()
    # helper that are defined elsewhere
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)


def save_model(model,