Add NLP task models
parent d8beb17dfb · commit 69f6de0ace
46 changed files with 4976 additions and 0 deletions
428 joint_paraphrase_model/.gitignore (vendored, new file)
@@ -0,0 +1,428 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,latex
# Edit at https://www.toptal.com/developers/gitignore?templates=python,latex

### LaTeX ###
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2
.*.lb

## Intermediate documents:
*.dvi
*.xdv
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
.pdf

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Build tool directories for auxiliary files
# latexrun
latex.out/

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# comment
*.cut

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs
*.lzo
*.lzs

# uncomment this for glossaries-extra (will ignore makeindex's style files!)
# *.ist

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.gtex

# htlatex
*.4ct
*.4tc
*.idv
*.lg
*.trc
*.xref

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary

# listings
*.lol

# luatexja-ruby
*.ltjruby

# makeidx
*.idx
*.ilg
*.ind

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# nomencl
*.nlg
*.nlo
*.nls

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# tcolorbox
*.listing

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# todonotes
*.tdo

# vhistory
*.hst
*.ver

# easy-todo
*.lod

# xcolor
*.xcp

# xmpincl
*.xmpi

# xindy
*.xdy

# xypic precompiled matrices and outlines
*.xyc
*.xyd

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# LyX
*.lyx~

# Kile
*.backup

# gummi
.*.swp

# KBibTeX
*~[0-9]*

# TeXnicCenter
*.tps

# auto folder when using emacs and auctex
./auto/*
*.el

# expex forward references with \gathertags
*-tags.tex

# standalone packages
*.sta

# Makeindex log files
*.lpz

# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
#*Notes.bib

### LaTeX Patch ###
# LIPIcs / OASIcs
*.vtc

# glossaries
*.glstex

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# End of https://www.toptal.com/developers/gitignore/api/python,latex
3 joint_paraphrase_model/README.md (new file)
@@ -0,0 +1,3 @@
# joint_paraphrase_model

joint training paraphrase model --- NeurIPS
112 joint_paraphrase_model/config.py (new file)
@@ -0,0 +1,112 @@
import os
import torch


# general
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PAD = "<__PAD__>"
UNK = "<__UNK__>"
NOFIX = "<__NOFIX__>"
SOS = "<__SOS__>"
EOS = "<__EOS__>"

batch_size = 1
teacher_forcing_ratio = 0.5
embedding_dim = 300
fix_hidden_dim = 128
par_hidden_dim = 1024
fix_dropout = 0.5
par_dropout = 0.2
_fix_learning_rate = 0.00001
_par_learning_rate = 0.0001
learning_rate = _par_learning_rate
fix_momentum = 0.9
par_momentum = 0.0
max_length = 121
epochs = 5

# paths
data_path = "./data"
provo_predictability_path = os.path.join(
    data_path, "datasets/provo/Provo_Corpus-Predictability_Norms.csv"
)
provo_eyetracking_path = os.path.join(
    data_path, "datasets/provo/Provo_Corpus-Eyetracking_Data.csv"
)

geco_en_path = os.path.join(data_path, "datasets/geco/EnglishMaterial.csv")
geco_mono_path = os.path.join(data_path, "datasets/geco/MonolingualReadingData.csv")

movieqa_human_path = os.path.join(data_path, "datasets/all_word_scores_fixations")
movieqa_human_path_2 = os.path.join(
    data_path, "datasets/all_word_scores_fixations_exp2"
)
movieqa_human_path_3 = os.path.join(
    data_path, "datasets/all_word_scores_fixations_exp3"
)
movieqa_split_plot_path = os.path.join(data_path, "datasets/split_plot_UNRESOLVED")

cnn_path = os.path.join(
    data_path,
    "projects/2019/fixation_prediction/ez-reader-wrapper/predictability/output_cnn/",
)
dm_path = os.path.join(
    data_path,
    "projects/2019/fixation_prediction/ez-reader-wrapper/predictability/output_dm/",
)

qqp_paws_basedir = os.path.join(data_path, "datasets/paw_google/qqp/paws_qqp/output")
qqp_paws_train_path = os.path.join(qqp_paws_basedir, "train.tsv")
qqp_paws_dev_path = os.path.join(qqp_paws_basedir, "dev.tsv")
qqp_paws_test_path = os.path.join(qqp_paws_basedir, "test.tsv")

qqp_basedir = os.path.join(data_path, "datasets/Quora_question_pair_partition_OG")
qqp_train_path = os.path.join(qqp_basedir, "train.tsv")
qqp_dev_path = os.path.join(qqp_basedir, "dev.tsv")
qqp_test_path = os.path.join(qqp_basedir, "test.tsv")

qqp_kag_basedir = os.path.join(data_path, "datasets/Quora_question_pair_partition_kag")
qqp_kag_train_path = os.path.join(qqp_kag_basedir, "train.tsv")
qqp_kag_dev_path = os.path.join(qqp_kag_basedir, "dev.tsv")
qqp_kag_test_path = os.path.join(qqp_kag_basedir, "test.tsv")

wiki_basedir = os.path.join(data_path, "datasets/paw_google/wiki")
wiki_train_path = os.path.join(wiki_basedir, "train.tsv")
wiki_dev_path = os.path.join(wiki_basedir, "dev.tsv")
wiki_test_path = os.path.join(wiki_basedir, "test.tsv")

msrpc_basedir = os.path.join(data_path, "datasets/MSRPC")
msrpc_train_path = os.path.join(msrpc_basedir, "msr_paraphrase_train.txt")
msrpc_dev_path = os.path.join(msrpc_basedir, "msr_paraphrase_dev.txt")
msrpc_test_path = os.path.join(msrpc_basedir, "msr_paraphrase_test.txt")

sentiment_basedir = os.path.join(data_path, "datasets/sentiment_kag")
sentiment_train_path = os.path.join(sentiment_basedir, "train.tsv")
sentiment_dev_path = os.path.join(sentiment_basedir, "dev.tsv")
sentiment_test_path = os.path.join(sentiment_basedir, "test.tsv")

tamil_basedir = os.path.join(data_path, "datasets/en-ta-parallel-v2")
tamil_train_path = os.path.join(tamil_basedir, "corpus.bcn.train.enta")
tamil_dev_path = os.path.join(tamil_basedir, "corpus.bcn.dev.enta")
tamil_test_path = os.path.join(tamil_basedir, "corpus.bcn.test.enta")

compression_basedir = os.path.join(data_path, "datasets/sentence-compression/data")
compression_train_path = os.path.join(compression_basedir, "train.tsv")
compression_dev_path = os.path.join(compression_basedir, "dev.tsv")
compression_test_path = os.path.join(compression_basedir, "test.tsv")

stanford_basedir = os.path.join(data_path, "datasets/stanfordSentimentTreebank")
stanford_train_path = os.path.join(stanford_basedir, "train.tsv")
stanford_dev_path = os.path.join(stanford_basedir, "dev.tsv")
stanford_test_path = os.path.join(stanford_basedir, "test.tsv")

stanford_sent_basedir = os.path.join(data_path, "datasets/stanfordSentimentTreebank")
stanford_sent_train_path = os.path.join(stanford_basedir, "train_sent.tsv")
stanford_sent_dev_path = os.path.join(stanford_basedir, "dev_sent.tsv")
stanford_sent_test_path = os.path.join(stanford_basedir, "test_sent.tsv")


emb_path = os.path.join(data_path, "Google_word2vec/GoogleNews-vectors-negative300.bin")

glove_path = "glove.840B.300d.txt"
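Aside (not part of this commit): downstream modules read these settings with a plain import; a minimal sketch, assuming the package root is on sys.path:

import config

print(config.DEV)                 # cuda if a GPU is visible, else cpu
print(config.learning_rate)       # 0.0001, aliased from _par_learning_rate
print(config.qqp_kag_train_path)  # ./data/datasets/Quora_question_pair_partition_kag/train.tsv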
1 joint_paraphrase_model/data (new symbolic link)
@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/
1 joint_paraphrase_model/glove.840B.300d.txt (new symbolic link)
@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/datasets/glove/glove.840B.300d.txt
1 joint_paraphrase_model/glove.cache (new file)
File diff suppressed because one or more lines are too long.
0 joint_paraphrase_model/libs/__init__.py (new, empty file)
416 joint_paraphrase_model/libs/corpora.py (new file)
@@ -0,0 +1,416 @@
import logging

import config


def tokenize(sent):
    return sent.split(" ")


class Lang:
    """Represents the vocabulary."""

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
            config.NOFIX: 2,
            config.SOS: 3,
            config.EOS: 4,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
            2: config.NOFIX,
            3: config.SOS,
            4: config.EOS,
        }
        self.n_words = 5

    def add_sentence(self, sentence):
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object.
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count.keys():
            new_lang.add_word(word)
        for word in other.word2count.keys():
            new_lang.add_word(word)

        # Fix the counts on the new one
        for word in new_lang.word2count.keys():
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang


def load_wiki(split):
    """Load the Wiki pairs from PAWS"""
    logger = logging.getLogger(f"{__name__}.load_wiki")
    lang = Lang("wiki")

    if split == "train":
        path = config.wiki_train_path
    elif split == "val":
        path = config.wiki_dev_path
    elif split == "test":
        path = config.wiki_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_qqp_paws(split):
    """Load the QQP pairs from PAWS"""
    logger = logging.getLogger(f"{__name__}.load_qqp_paws")
    lang = Lang("qqp_paws")

    if split == "train":
        path = config.qqp_paws_train_path
    elif split == "val":
        path = config.qqp_paws_dev_path
    elif split == "test":
        path = config.qqp_paws_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_qqp(split):
    """Load the QQP from the original partition"""
    logger = logging.getLogger(f"{__name__}.load_qqp")
    lang = Lang("qqp")

    if split == "train":
        path = config.qqp_train_path
    elif split == "val":
        path = config.qqp_dev_path
    elif split == "test":
        path = config.qqp_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            rating, sent1, sent2, _ = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_qqp_kag(split):
    """Load the QQP from Kaggle"""  # not the original partition right now; experimenting with the Kaggle 100K/3K/30K split
    logger = logging.getLogger(f"{__name__}.load_qqp_kag")
    lang = Lang("qqp_kag")

    if split == "train":
        path = config.qqp_kag_train_path
    elif split == "val":
        path = config.qqp_kag_dev_path
    elif split == "test":
        path = config.qqp_kag_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:  # the Kaggle version has 3 fields rather than 4
            rating, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # MS makes the vocab for paraphrase the same
    return pairs, lang


def load_msrpc(split):
    """Load the Microsoft Research Paraphrase Corpus (MSRPC)"""
    logger = logging.getLogger(f"{__name__}.load_msrpc")
    lang = Lang("msrpc")

    if split == "train":
        path = config.msrpc_train_path
    elif split == "val":
        path = config.msrpc_dev_path
    elif split == "test":
        path = config.msrpc_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            rating, _, _, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    # return src_lang, dst_lang, pairs
    # MS makes the vocab for paraphrase the same

    return pairs, lang


def load_sentiment(split):
    """Load the sentiment dataset from the Kaggle competition"""
    logger = logging.getLogger(f"{__name__}.load_sentiment")
    lang = Lang("sentiment")

    if split == "train":
        path = config.sentiment_train_path
    elif split == "val":
        path = config.sentiment_dev_path
    elif split == "test":
        path = config.sentiment_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []

    with open(path) as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    return pairs, lang


def load_tamil(split):
    """Load the English-to-Tamil dataset (current SOTA is roughly 13 BLEU)"""
    logger = logging.getLogger(f"{__name__}.load_tamil")
    lang = Lang("tamil")

    if split == "train":
        path = config.tamil_train_path
    elif split == "val":
        path = config.tamil_dev_path
    elif split == "test":
        path = config.tamil_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            # if rating == "0":
            #     continue
            sent1 = tokenize(sent1)
            # TODO: whitespace splitting is a placeholder; unclear how to tokenize Tamil properly
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_compression(split):
    """Load the Google Sentence Compression Dataset"""
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.compression_train_path
    elif split == "val":
        path = config.compression_dev_path
    elif split == "test":
        path = config.compression_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []
    with open(path) as handle:

        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            # print(len(sent1), sent1)
            # print(len(sent2), sent2)
            # print()
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford(split):
    """Load the Stanford Sentiment Dataset phrases"""
    logger = logging.getLogger(f"{__name__}.load_stanford")
    lang = Lang("stanford")

    if split == "train":
        path = config.stanford_train_path
    elif split == "val":
        path = config.stanford_dev_path
    elif split == "test":
        path = config.stanford_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []

    with open(path) as handle:

        # skip header
        # handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    return pairs, lang


def load_stanford_sent(split):
    """Load the Stanford Sentiment Dataset sentences"""
    logger = logging.getLogger(f"{__name__}.load_stanford_sent")
    lang = Lang("stanford_sent")

    if split == "train":
        path = config.stanford_sent_train_path
    elif split == "val":
        path = config.stanford_sent_dev_path
    elif split == "test":
        path = config.stanford_sent_test_path

    logger.info("loading %s from %s" % (split, path))

    pairs = []

    with open(path) as handle:

        # skip header
        # handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            # pairs.append([sent1, sent2, rating])
            pairs.append([sent1, sent2])

    return pairs, lang
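Aside (not part of this commit): every loader above returns the same (pairs, lang) shape, so corpora can be swapped freely downstream. A minimal sketch, assuming the symlinked ./data tree is in place:

from libs import corpora

train_pairs, train_lang = corpora.load_qqp_kag("train")
val_pairs, val_lang = corpora.load_qqp_kag("val")

# Lang.__add__ unions both vocabularies and sums the per-word counts.
vocab = train_lang + val_lang
print(vocab.n_words, len(train_pairs))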
1 joint_paraphrase_model/libs/fixation_generation/__init__.py (new file)
@@ -0,0 +1 @@
from .main import *
131 joint_paraphrase_model/libs/fixation_generation/main.py (new file)
@@ -0,0 +1,131 @@
from collections import OrderedDict
import logging
import sys

from .self_attention import Transformer

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence


def random_embedding(vocab_size, embedding_dim):
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)

    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)

    # print(score[0], tag_seq[0])

    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    # score = torch.nn.functional.softmax(outputs, 1)
    score = torch.sigmoid(outputs)

    # mask out the padded positions so they do not contribute to the loss
    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1

    score = score * mask

    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )

    loss = loss / batch_size

    return loss, score.view(batch_size, seq_len)


class Network(nn.Module):
    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if attention:
            # increased complexity with 1024D and 16 heads/16 layers: for the no-att and att experiments
            # before: for the initial att and pretraining: heads 4 and layers 4, 128D
            # then was 128D with heads 4, layers 1 = results for all IUI
            ### postlayers["position_encodings"] = PositionalEncoding(hidden_dim)
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)
        # MS: printing fix model params
        # for p in self.parameters():
        #     print(p.data)
        #     break

        return self.post(x.transpose(1, 0))


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(x, word_seq_length, True, False)
        lstm_out, hidden = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
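Aside (not part of this commit): a minimal sketch of one forward pass through the fixation network above, assuming a toy vocabulary of 1000 words and no pretrained embedding matrix:

import torch
from libs.fixation_generation import Network

net = Network(
    embedding_type="glove",
    vocab_size=1000,
    embedding_dim=300,
    dropout=0.5,
    hidden_dim=128,
)
tokens = torch.randint(0, 1000, (1, 12))  # (batch, seq_len) word indices
scores = net(tokens, [12])                # one unnormalized fixation score per token
print(scores.shape)                       # torch.Size([1, 12, 1])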
131 joint_paraphrase_model/libs/fixation_generation/self_attention.py (new file)
@@ -0,0 +1,131 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        # Not a parameter
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy

        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()


class AttentionLayer(nn.Module):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def forward(self, Q, K, V):
        # Q: float32[batch_size, n_queries, d_k]
        # K: float32[batch_size, n_keys, d_k]
        # V: float32[batch_size, n_keys, d_v]
        dk = K.shape[-1]
        dv = V.shape[-1]
        KT = torch.transpose(K, -1, -2)
        weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
        # weight_logits: float32[batch_size, n_queries, n_keys]
        weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
        return torch.bmm(weights, V)
        # returns float32[batch_size, n_queries, d_v]


class MultiHeadedSelfAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        print('{} {}'.format(d_model, n_heads))
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q: float32[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K: float32[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V: float32[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # head_output: float32[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated: float32[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out: float32[batch_size, sequence_length, self.d_model]
        return out


class Feedforward(nn.Module):
    def __init__(self, d_model):
        super(Feedforward, self).__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, d_model]
        return self.W2(torch.relu(self.W1(x)))


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, n_layers):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList([
            MultiHeadedSelfAttentionLayer(d_model, n_heads)
            for _ in range(n_layers)
        ])
        self.ffs = nn.ModuleList([
            Feedforward(d_model)
            for _ in range(n_layers)
        ])

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        for attention_layer, ff in zip(self.attention_layers, self.ffs):
            attention_out = attention_layer(x)
            # attention_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + attention_out, x.shape[2:])
            ff_out = ff(x)
            # ff_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + ff_out, x.shape[2:])
        return x
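Aside (not part of this commit): the transformer block above is shape-preserving, which is what lets the fixation model drop it into an nn.Sequential; a minimal sketch:

import torch
from libs.fixation_generation.self_attention import Transformer

block = Transformer(d_model=128, n_heads=4, n_layers=1)
x = torch.randn(2, 10, 128)  # (batch, seq_len, d_model)
print(block(x).shape)        # torch.Size([2, 10, 128])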
1 joint_paraphrase_model/libs/paraphrase_generation/__init__.py (new file)
@@ -0,0 +1 @@
from .main import *
86 joint_paraphrase_model/libs/paraphrase_generation/main.py (new file)
@@ -0,0 +1,86 @@
import json
import math
import os

import random
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embeddings):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.gru = nn.GRU(input_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)


class AttnDecoderRNN(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        output_size,
        embeddings,
        dropout_p,
        max_length,
    ):
        super(AttnDecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding.from_pretrained(embeddings)  # for paraphrase generation
        # self.embedding = nn.Embedding(len(embeddings), 300)  # for NMT with Tamil; trying with sentiment too
        self.attn = nn.Linear(self.input_size + self.hidden_size, self.max_length)
        self.attn_combine = nn.Linear(
            self.input_size + self.hidden_size, self.hidden_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs, fixations):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1
        )

        # scale the attention weights by the (zero-padded) fixation scores
        attn_weights = attn_weights * torch.nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)(fixations.squeeze().unsqueeze(0))

        # attn_weights = torch.softmax(attn_weights * torch.nn.ConstantPad1d((0, attn_weights.shape[-1] - fixations.shape[-2]), 0)(fixations.squeeze().unsqueeze(0)), dim=1)

        attn_applied = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0)
        )

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        # output = F.log_softmax(self.out(output[0]), dim=1)
        output = self.out(output[0])
        # output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights
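Aside (not part of this commit): the encoder is driven one token at a time, as main.py does below; a minimal sketch with a stand-in embedding matrix:

import torch
from libs.paraphrase_generation import EncoderRNN

embeddings = torch.randn(100, 300)  # stand-in for the real GloVe matrix
enc = EncoderRNN(input_size=300, hidden_size=1024, embeddings=embeddings)
hidden = enc.initHidden()
out, hidden = enc(torch.tensor([[7]]), hidden)  # a single token index
print(out.shape, hidden.shape)  # torch.Size([1, 1, 1024]) twice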
225 joint_paraphrase_model/libs/utils.py (new file)
@@ -0,0 +1,225 @@
import json
import logging
import math
import os
import random
import re
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn

import config


plt.switch_backend("agg")


def load_glove(vocabulary):
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except FileNotFoundError:
        logger.info("cache doesn't exist")
        cache = {}
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            try:
                cache[word] = data[word]
            except KeyError:
                cache[word] = [0] * 300
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings


def tokenize(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.split(" ")
    return s


def indices_from_sentence(word2index, sentence, unknown_threshold):
    if unknown_threshold:
        return [
            word2index.get(
                word if random.random() > unknown_threshold else config.UNK,
                word2index[config.UNK],
            )
            for word in sentence
        ]
    else:
        return [
            word2index.get(word, word2index[config.UNK]) for word in sentence
        ]


def tensor_from_sentence(word2index, sentence, unknown_threshold):
    # indices = [config.SOS]
    indices = indices_from_sentence(word2index, sentence, unknown_threshold)
    indices.append(word2index[config.EOS])
    return torch.tensor(indices, dtype=torch.long, device=config.DEV)


def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    tensors = [
        tensor_from_sentence(word2index, pair[0], unknown_threshold),
        tensor_from_sentence(word2index, pair[1], unknown_threshold),
    ]
    if shuffle:
        random.shuffle(tensors)
    return tensors


def bleu(reference, hypothesis, n=4):  # not sure whether this actually changes the n-gram order
    if n < 1:
        return 0
    weights = [1 / n] * n
    return sentence_bleu([reference], hypothesis, weights)


def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    if shuffle:
        pairs = pairs.copy()
        random.shuffle(pairs)
    for pair in pairs:
        tensor1, tensor2 = tensors_from_pair(word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold)
        yield (tensor1,), (tensor2,)


def sent_iter(sents, word2index, unknown_threshold=0.00):
    for sent in sents:
        tensor = tensor_from_sentence(word2index, sent, unknown_threshold)
        yield (tensor,)


def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    for i in range(len(pairs) // batch_size):
        # fixed: was pairs[i : i + batch_size], which yielded overlapping batches
        batch = pairs[i * batch_size : (i + 1) * batch_size]
        if len(batch) != batch_size:
            continue
        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        # targets = torch.tensor(targets, dtype=torch.long, device=config.DEV)

        # tensors1_lengths = [len(t) for t in tensors1]
        # tensors2_lengths = [len(t) for t in tensors2]

        # tensors1 = nn.utils.rnn.pack_sequence(tensors1, enforce_sorted=False)
        # tensors2 = nn.utils.rnn.pack_sequence(tensors2, enforce_sorted=False)

        yield tensors1, tensors2


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    # NOTE: assumes evaluate, encoder1 and attn_decoder1 exist in the calling scope
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)


def save_model(model, word2index, path):
    if not path.endswith(".tar"):
        path += ".tar"
    torch.save(
        {"weights": model.state_dict(), "word2index": word2index},
        path,
    )


def load_model(path):
    checkpoint = torch.load(path)
    return checkpoint["weights"], checkpoint["word2index"]


def extend_vocabulary(word2index, langs):
    for lang in langs:
        for word in lang.word2index:
            if word not in word2index:
                word2index[word] = len(word2index)
    return word2index
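Aside (not part of this commit): a minimal sketch of the iteration and scoring helpers above, using a toy vocabulary whose special-token indices mirror corpora.Lang:

from libs import utils

word2index = {"<__PAD__>": 0, "<__UNK__>": 1, "<__NOFIX__>": 2,
              "<__SOS__>": 3, "<__EOS__>": 4, "hello": 5, "there": 6}
pairs = [(["hello", "there"], ["there", "hello"])]

for (src,), (tgt,) in utils.pair_iter(pairs, word2index):
    print(src, tgt)  # EOS-terminated index tensors on config.DEV

print(utils.bleu(["a", "b", "c", "d"], ["a", "b", "c", "d"]))  # 1.0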
794
joint_paraphrase_model/main.py
Normal file
794
joint_paraphrase_model/main.py
Normal file
|
@ -0,0 +1,794 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import click
|
||||||
|
import sacrebleu
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
import config
|
||||||
|
from libs import corpora
|
||||||
|
from libs import utils
|
||||||
|
from libs.fixation_generation import Network as FixNN
|
||||||
|
from libs.paraphrase_generation import (
|
||||||
|
EncoderRNN as ParEncNN,
|
||||||
|
AttnDecoderRNN as ParDecNN,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
cwd = os.path.dirname(__file__)
|
||||||
|
|
||||||
|
logger = logging.getLogger("main")
|
||||||
|
|
||||||
|
'''
|
||||||
|
#qqp_paw sentences:
|
||||||
|
|
||||||
|
debug_sentences = [
|
||||||
|
'What are the driving rules in Georgia versus Mississippi ?',
|
||||||
|
'I want to be a child psychologist , what qualification do i need to become one ? Are there good and reputed psychology Institute or Colleges in India ?',
|
||||||
|
'What is deep web and dark web and what are the contents of these sites ?',
|
||||||
|
'What is difference between North Indian Brahmins and South Indian Brahmins ?',
|
||||||
|
    'Is carbon dioxide an ionic bond or a covalent bond ?',
    'How do accounts receivable and accounts payable differ ?',
    'Why did Wikipedia hide its audit history for ( superluminal ) successful speed experiments ?',
    'What makes a person simple , or inversely , complicated ?',
    '`` How do you say `` Miss you , too , `` in Spanish ? Are there multiple ways to say it ? ‘’',
    'What is the difference between dominant trait and recessive trait ?’',
    '`` What is the difference between `` seeing someone , `` `` dating someone , `` and `` having a girlfriend/boyfriend `` ? ‘’',
    'How was the Empire State building built and designed ? How is it used ?',
    'What is the sum of the square roots of first n natural number ?',
    'Why is Roman Saini not so active on Quora now a days ?',
    'If I have someone blocked on Instagram , and see their story , can they view I viewed it ?',
    'Amongst the major IT companies of India which is the best ; Wipro , Capgemini , Infosys , TCS or is Oracle the best ?',
    'How much mass does Saturn lose each year ? How much mass does it gain ?',
    'What is a cheap healthy diet , I can keep the same and eat every day ?',
    ' What is it like to be beautiful ? Not just pretty or hot , but the kind of almost objective beauty that people are sometimes intimidated by ?',
    'Could someone tell of a James Ronsey ( misspelled likely ) , writer and filmmaker , probably of the British Isles ?',
    'How much pressure Is there around the core of Pluto ? is it enough to turn hydrogen/helium gas into a liquid or metallic state ?',
    'How does quality of life in Vancouver compare to that in Melbourne Or Brisbane ?',
]
'''

'''
#wiki sentences:

debug_sentences = [
    'They were there to enjoy us and they were there to pray for us .',
    'Components of elastic potential systems store mechanical energy if they are deformed when forces are applied to the system .',
    'Steam can also be used , and does not need to be pumped .',
    'The solar approach to this requirement is the use of solar panels in a conventional-powered aircraft .',
    'Daudkhali is a village in Barisal Division in the Pirojpur district in southwestern Bangladesh .',
    'Briggs later met Briggs at the 1967 Monterey Pop Festival , where Ravi Shankar was also performing , with Eric Burdon and The Animals .',
    'Brockton is approximately 25 miles northeast of Providence , Rhode Island , and 30 miles south of Boston .',
]
'''

#qqp sentences:

debug_sentences = [
    'How do I get funding for my web based startup idea ?',
    'What do intelligent people do to pass time ?',
    'Which is the best SEO Company in Delhi ?',
    'Why do you waer makeup ?',
    'How do start chatting with a girl ?',
    'What is the meaning of living life ?',
    'Why do my armpits hurt ?',
    'Why does eye color change with age ?',
    'How do you find the standard deviation of a probability distribution ? What are some examples ?',
    'How can I complete my 11 syllabus in one month ?',
    'How do I concentrate better on my studies ?',
    'Which is the best retirement plan in india ?',
    'Should I tell my best friend I love her ?',
    'Which is the best company for Appian Vagrant online job support ?',
    'How can one do for good handwriting ?',
    'What are remedies to get rid of belly fat ?',
    'What is the best way to cook precooked turkey ?',
    'What is the future of e-commerce in India ?',
    'Why do my burps taste like rotten eggs ?',
    'What is an example of chemical weathering ?',
    'What are some of the advantages and disadvantages of cyber schooling ?',
    'How can I increase traffic to my websites by Facebook ?',
    'How do I increase my patience level in life ?',
    'What are the best hospitals for treating cancer in India ?',
    'Will Jio sim work in a 3G phone ? If yes , how ?',
]

debug_sentences = [s.split(" ") for s in debug_sentences]
class Network(nn.Module):
    def __init__(
        self,
        word2index,
        embeddings,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        self.word2index = word2index
        self.index2word = {i: k for k, i in word2index.items()}
        # fixation generator: predicts a fixation weight for every input token
        self.fix_gen = FixNN(
            embedding_type="glove",
            vocab_size=len(word2index),
            embedding_dim=config.embedding_dim,
            embeddings=embeddings,
            dropout=config.fix_dropout,
            hidden_dim=config.fix_hidden_dim,
        )
        # paraphrase encoder/decoder pair
        self.par_enc = ParEncNN(
            input_size=config.embedding_dim,
            hidden_size=config.par_hidden_dim,
            embeddings=embeddings,
        )
        self.par_dec = ParDecNN(
            input_size=config.embedding_dim,
            hidden_size=config.par_hidden_dim,
            output_size=len(word2index),
            embeddings=embeddings,
            dropout_p=config.par_dropout,
            max_length=config.max_length,
        )

    def forward(self, x, target=None, teacher_forcing_ratio=None):
        teacher_forcing_ratio = teacher_forcing_ratio if teacher_forcing_ratio is not None else config.teacher_forcing_ratio
        x1 = nn.utils.rnn.pad_sequence(x, batch_first=True)
        x2 = nn.utils.rnn.pad_sequence(x, batch_first=False)
        fixations = torch.sigmoid(self.fix_gen(x1, [len(_x) for _x in x1]))

        enc_hidden = self.par_enc.initHidden().to(config.DEV)
        enc_outs = torch.zeros(config.max_length, config.par_hidden_dim, device=config.DEV)

        # encode the input sequence token by token
        for ei in range(len(x2)):
            enc_out, enc_hidden = self.par_enc(x2[ei], enc_hidden)
            enc_outs[ei] += enc_out[0, 0]

        dec_in = torch.tensor([[self.word2index[config.SOS]]], device=config.DEV)  # SOS
        dec_hidden = enc_hidden
        dec_outs = []
        dec_words = []
        dec_atts = torch.zeros(config.max_length, config.max_length)

        if target is not None:  # training
            target = nn.utils.rnn.pad_sequence(target, batch_first=False)
            use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

            if use_teacher_forcing:
                for di in range(len(target)):
                    dec_out, dec_hidden, dec_att = self.par_dec(
                        dec_in, dec_hidden, enc_outs, fixations
                    )
                    dec_outs.append(dec_out)
                    dec_atts[di] = dec_att.data
                    # teacher forcing: feed the ground-truth token to the next step
                    dec_in = target[di]

            else:
                for di in range(len(target)):
                    dec_out, dec_hidden, dec_att = self.par_dec(
                        dec_in, dec_hidden, enc_outs, fixations
                    )
                    dec_outs.append(dec_out)
                    dec_atts[di] = dec_att.data
                    topv, topi = dec_out.data.topk(1)
                    dec_words.append(self.index2word[topi.item()])

                    # feed the model's own prediction to the next step
                    dec_in = topi.squeeze().detach()

        else:  # prediction
            for di in range(config.max_length):
                dec_out, dec_hidden, dec_att = self.par_dec(
                    dec_in, dec_hidden, enc_outs, fixations
                )
                dec_outs.append(dec_out)
                dec_atts[di] = dec_att.data
                topv, topi = dec_out.data.topk(1)
                if topi.item() == self.word2index[config.EOS]:
                    dec_words.append("<__EOS__>")
                    break
                else:
                    dec_words.append(self.index2word[topi.item()])

                dec_in = topi.squeeze().detach()

        return dec_outs, dec_words, dec_atts[: di + 1], fixations
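# Illustrative usage sketch (not part of the original training code; it assumes
# `word2index` and `embeddings` are obtained as in `load_corpus`/`init_network`
# below, and the UNK fallback is an assumption about out-of-vocabulary tokens):
#
#   network = Network(word2index=word2index, embeddings=embeddings).to(config.DEV)
#   tokens = torch.tensor(
#       [word2index.get(w, word2index[config.UNK]) for w in "how are you ?".split(" ")],
#       device=config.DEV,
#   )
#   dec_outs, dec_words, dec_atts, fixations = network([tokens])  # no target: inference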
def load_corpus(corpus_name, splits):
    if not splits:
        return

    logger.info("loading corpus")
    if corpus_name == "msrpc":
        load_fn = corpora.load_msrpc
    elif corpus_name == "qqp":
        load_fn = corpora.load_qqp
    elif corpus_name == "wiki":
        load_fn = corpora.load_wiki
    elif corpus_name == "qqp_paws":
        load_fn = corpora.load_qqp_paws
    elif corpus_name == "qqp_kag":
        load_fn = corpora.load_qqp_kag
    elif corpus_name == "sentiment":
        load_fn = corpora.load_sentiment
    elif corpus_name == "stanford":
        load_fn = corpora.load_stanford
    elif corpus_name == "stanford_sent":
        load_fn = corpora.load_stanford_sent
    elif corpus_name == "tamil":
        load_fn = corpora.load_tamil
    elif corpus_name == "compression":
        load_fn = corpora.load_compression

    corpus = {}
    langs = []

    if "train" in splits:
        train_pairs, train_lang = load_fn("train")
        corpus["train"] = train_pairs
        langs.append(train_lang)
    if "val" in splits:
        val_pairs, val_lang = load_fn("val")
        corpus["val"] = val_pairs
        langs.append(val_lang)
    if "test" in splits:
        test_pairs, test_lang = load_fn("test")
        corpus["test"] = test_pairs
        langs.append(test_lang)

    logger.info("creating word index")
    lang = langs[0]
    for _lang in langs[1:]:
        lang += _lang
    word2index = lang.word2index

    index2word = {i: w for w, i in word2index.items()}

    return corpus, word2index, index2word
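# Side note (sketch only, same behaviour as the if/elif chain above): the
# corpus dispatch could also be written as a lookup table, which fails loudly
# on unknown names:
#
#   load_fn = {
#       "msrpc": corpora.load_msrpc,
#       "qqp": corpora.load_qqp,
#       "wiki": corpora.load_wiki,
#       # ... remaining corpora as above ...
#   }[corpus_name]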
def init_network(word2index):
    logger.info("loading embeddings")
    vocabulary = sorted(word2index.keys())
    embeddings = utils.load_glove(vocabulary)

    logger.info("initializing model")
    network = Network(
        word2index=word2index,
        embeddings=embeddings,
    )
    network.to(config.DEV)

    print(f"#parameters: {sum(p.numel() for p in network.parameters())}")

    return network
@click.group(context_settings=dict(help_option_names=["-h", "--help"]))
@click.option("-v", "--verbose", count=True)
@click.option("-d", "--debug", is_flag=True)
def main(verbose, debug):
    if verbose == 0:
        loglevel = logging.ERROR
    elif verbose == 1:
        loglevel = logging.WARN
    elif verbose >= 2:
        loglevel = logging.INFO

    if debug:
        loglevel = logging.DEBUG

    logging.basicConfig(
        format="[%(asctime)s] <%(name)s> %(levelname)s: %(message)s",
        datefmt="%d.%m. %H:%M:%S",
        level=loglevel,
    )

    logger.debug("arguments: %s" % str(sys.argv))
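# Example invocations (script, corpus, and model names are placeholders):
#   python main.py -v train -c qqp -m my_model -b nltk
#   python main.py -v val -c qqp -w models/my_model/my_model_1 -b sacrebleu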
@main.command()
@click.option(
    "-c",
    "--corpus",
    "corpus_name",
    required=True,
    type=click.Choice(sorted(["wiki", "qqp", "qqp_kag", "msrpc", "qqp_paws", "sentiment", "stanford", "stanford_sent", "tamil", "compression"])),
)
@click.option("-m", "--model_name", required=True)
@click.option("-w", "--fixation_weights", required=False)
@click.option("-f", "--freeze_fixations", is_flag=True, default=False)
@click.option("-b", "--bleu", type=click.Choice(["sacrebleu", "nltk"]), required=True)
def train(corpus_name, model_name, fixation_weights, freeze_fixations, bleu):
    corpus, word2index, index2word = load_corpus(corpus_name, ["train", "val"])
    train_pairs = corpus["train"]
    val_pairs = corpus["val"]
    network = init_network(word2index)

    model_dir = os.path.join("models", model_name)
    logger.debug("creating model dir %s" % model_dir)
    pathlib.Path(model_dir).mkdir(parents=True)

    if fixation_weights is not None:
        logger.info("loading fixation prediction checkpoint")
        checkpoint = torch.load(fixation_weights, map_location=config.DEV)
        if "word2index" in checkpoint:
            weights = checkpoint["weights"]
        else:
            weights = checkpoint

        # remove the embedding layer before loading
        weights = {k: v for k, v in weights.items() if not k.startswith("pre.embedding_layer")}
        network.fix_gen.load_state_dict(weights, strict=False)

    if freeze_fixations:
        logger.info("freezing fixation generation network")
        for p in network.fix_gen.parameters():
            p.requires_grad = False

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(network.parameters(), lr=config.learning_rate)
    #optimizer = torch.optim.Adam(network.parameters(), lr=1e-4, weight_decay=1e-5)

    best_val_loss = None

    epoch = 1
    while True:
        train_batch_iter = utils.pair_iter(pairs=train_pairs, word2index=word2index, shuffle=True, shuffle_pairs=False)
        val_batch_iter = utils.pair_iter(pairs=val_pairs, word2index=word2index, shuffle=False, shuffle_pairs=False)
        # test_batch_iter = utils.pair_iter(pairs=test_pairs, word2index=word2index, shuffle=False, shuffle_pairs=False)

        running_train_loss = 0
        total_train_loss = 0
        total_val_loss = 0

        if bleu == "sacrebleu":
            running_train_bleu = 0
            total_train_bleu = 0
            total_val_bleu = 0
        elif bleu == "nltk":
            running_train_bleu_1 = 0
            running_train_bleu_2 = 0
            running_train_bleu_3 = 0
            running_train_bleu_4 = 0
            total_train_bleu_1 = 0
            total_train_bleu_2 = 0
            total_train_bleu_3 = 0
            total_train_bleu_4 = 0
            total_val_bleu_1 = 0
            total_val_bleu_2 = 0
            total_val_bleu_3 = 0
            total_val_bleu_4 = 0

        network.train()
        for i, batch in enumerate(train_batch_iter, 1):
            optimizer.zero_grad()

            input, target = batch
            prediction, words, attention, fixations = network(input, target)

            loss = loss_fn(
                torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], target[0]
            )

            loss.backward()
            optimizer.step()

            running_train_loss += loss.item()
            total_train_loss += loss.item()

            _prediction = " ".join([index2word[_x] for _x in torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist()])
            _target = " ".join([index2word[_x] for _x in target[0].tolist()])

            if bleu == "sacrebleu":
                bleu_score = sacrebleu.sentence_bleu(_prediction, _target).score
                running_train_bleu += bleu_score
                total_train_bleu += bleu_score
            elif bleu == "nltk":
                bleu_1_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=1)
                bleu_2_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=2)
                bleu_3_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=3)
                bleu_4_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=4)
                running_train_bleu_1 += bleu_1_score
                running_train_bleu_2 += bleu_2_score
                running_train_bleu_3 += bleu_3_score
                running_train_bleu_4 += bleu_4_score
                total_train_bleu_1 += bleu_1_score
                total_train_bleu_2 += bleu_2_score
                total_train_bleu_3 += bleu_3_score
                total_train_bleu_4 += bleu_4_score
                # print(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist())

            if i % 100 == 0:
                if bleu == "sacrebleu":
                    print(f"step {i} avg_train_loss {running_train_loss/100:.4f} avg_train_bleu {running_train_bleu/100:.2f}")
                elif bleu == "nltk":
                    print(f"step {i} avg_train_loss {running_train_loss/100:.4f} avg_train_bleu_1 {running_train_bleu_1/100:.2f} avg_train_bleu_2 {running_train_bleu_2/100:.2f} avg_train_bleu_3 {running_train_bleu_3/100:.2f} avg_train_bleu_4 {running_train_bleu_4/100:.2f}")

                # periodically dump predictions for the debug sentences
                network.eval()
                with open(os.path.join(model_dir, f"debug_{epoch}_{i}.out"), "w") as h:
                    if bleu == "sacrebleu":
                        h.write(f"# avg_train_loss {running_train_loss/100:.4f} avg_train_bleu {running_train_bleu/100:.2f}")
                        running_train_bleu = 0
                    elif bleu == "nltk":
                        h.write(f"# avg_train_loss {running_train_loss/100:.4f} avg_train_bleu_1 {running_train_bleu_1/100:.2f} avg_train_bleu_2 {running_train_bleu_2/100:.2f} avg_train_bleu_3 {running_train_bleu_3/100:.2f} avg_train_bleu_4 {running_train_bleu_4/100:.2f}")
                        running_train_bleu_1 = 0
                        running_train_bleu_2 = 0
                        running_train_bleu_3 = 0
                        running_train_bleu_4 = 0

                    running_train_loss = 0

                    h.write("\n")
                    h.write("\t".join(["sentence", "prediction", "attention", "fixations"]))
                    h.write("\n")
                    for s, input in zip(debug_sentences, utils.sent_iter(debug_sentences, word2index=word2index)):
                        prediction, words, attentions, fixations = network(input)
                        prediction = torch.argmax(torch.stack(prediction).squeeze(1), -1).detach().cpu().tolist()
                        prediction = [index2word.get(x, "<__UNK__>") for x in prediction]
                        attentions = attentions.detach().cpu().squeeze().tolist()
                        fixations = fixations.detach().cpu().squeeze().tolist()
                        h.write(f"{s}\t{prediction}\t{attentions}\t{fixations}")
                        h.write("\n")

                network.train()

        network.eval()
        for i, batch in enumerate(val_batch_iter):
            input, target = batch
            prediction, words, attention, fixations = network(input, target, teacher_forcing_ratio=0)
            loss = loss_fn(
                torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], target[0]
            )

            _prediction = " ".join([index2word[_x] for _x in torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist()])
            _target = " ".join([index2word[_x] for _x in target[0].tolist()])
            if bleu == "sacrebleu":
                bleu_score = sacrebleu.sentence_bleu(_prediction, _target).score
                total_val_bleu += bleu_score
            elif bleu == "nltk":
                bleu_1_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=1)
                bleu_2_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=2)
                bleu_3_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=3)
                bleu_4_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=4)
                total_val_bleu_1 += bleu_1_score
                total_val_bleu_2 += bleu_2_score
                total_val_bleu_3 += bleu_3_score
                total_val_bleu_4 += bleu_4_score

            total_val_loss += loss.item()

        avg_val_loss = total_val_loss/len(val_pairs)

        if bleu == "sacrebleu":
            print(f"epoch {epoch} avg_train_loss {total_train_loss/len(train_pairs):.4f} avg_val_loss {avg_val_loss:.4f} avg_train_bleu {total_train_bleu/len(train_pairs):.2f} avg_val_bleu {total_val_bleu/len(val_pairs):.2f}")
        elif bleu == "nltk":
            print(f"epoch {epoch} avg_train_loss {total_train_loss/len(train_pairs):.4f} avg_val_loss {avg_val_loss:.4f} avg_train_bleu_1 {total_train_bleu_1/len(train_pairs):.2f} avg_train_bleu_2 {total_train_bleu_2/len(train_pairs):.2f} avg_train_bleu_3 {total_train_bleu_3/len(train_pairs):.2f} avg_train_bleu_4 {total_train_bleu_4/len(train_pairs):.2f} avg_val_bleu_1 {total_val_bleu_1/len(val_pairs):.2f} avg_val_bleu_2 {total_val_bleu_2/len(val_pairs):.2f} avg_val_bleu_3 {total_val_bleu_3/len(val_pairs):.2f} avg_val_bleu_4 {total_val_bleu_4/len(val_pairs):.2f}")

        with open(os.path.join(model_dir, f"debug_{epoch}_end.out"), "w") as h:
            if bleu == "sacrebleu":
                h.write(f"# avg_train_loss {total_train_loss/len(train_pairs)} avg_val_loss {total_val_loss/len(val_pairs)} avg_train_bleu {total_train_bleu/len(train_pairs)} avg_val_bleu {total_val_bleu/len(val_pairs)}")
            elif bleu == "nltk":
                h.write(f"# avg_train_loss {total_train_loss/len(train_pairs):.4f} avg_val_loss {avg_val_loss:.4f} avg_train_bleu_1 {total_train_bleu_1/len(train_pairs):.2f} avg_train_bleu_2 {total_train_bleu_2/len(train_pairs):.2f} avg_train_bleu_3 {total_train_bleu_3/len(train_pairs):.2f} avg_train_bleu_4 {total_train_bleu_4/len(train_pairs):.2f} avg_val_bleu_1 {total_val_bleu_1/len(val_pairs):.2f} avg_val_bleu_2 {total_val_bleu_2/len(val_pairs):.2f} avg_val_bleu_3 {total_val_bleu_3/len(val_pairs):.2f} avg_val_bleu_4 {total_val_bleu_4/len(val_pairs):.2f}")
            h.write("\n")
            h.write("\t".join(["sentence", "prediction", "attention", "fixations"]))
            h.write("\n")
            for s, input in zip(debug_sentences, utils.sent_iter(debug_sentences, word2index=word2index)):
                prediction, words, attentions, fixations = network(input)
                prediction = torch.argmax(torch.stack(prediction).squeeze(1), -1).detach().cpu().tolist()
                prediction = [index2word.get(x, "<__UNK__>") for x in prediction]
                attentions = attentions.detach().cpu().squeeze().tolist()
                fixations = fixations.detach().cpu().squeeze().tolist()
                h.write(f"{s}\t{prediction}\t{attentions}\t{fixations}")
                h.write("\n")

        utils.save_model(network, word2index, os.path.join(model_dir, f"{model_name}_{epoch}"))

        if best_val_loss is None or avg_val_loss < best_val_loss:
            if best_val_loss is not None:
                logger.info(f"{avg_val_loss} < {best_val_loss} ({avg_val_loss-best_val_loss}): new best model from epoch {epoch}")
            else:
                logger.info(f"{avg_val_loss}: first validation loss, best model from epoch {epoch}")

            best_val_loss = avg_val_loss
            # save_model(model, word2index, model_name + "_epoch_" + str(epoch))
            # utils.save_model(network, word2index, os.path.join(model_dir, f"{model_name}_best"))

        epoch += 1
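# Hypothetical example: warm-start the fixation network from a pretrained
# checkpoint (the path is a placeholder) and keep it frozen while training:
#   python main.py train -c qqp -m qqp_frozen_fix -w fix_checkpoint.pt -f -b nltk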
@main.command()
@click.option(
    "-c",
    "--corpus",
    "corpus_name",
    required=True,
    type=click.Choice(sorted(["wiki", "qqp", "qqp_kag", "msrpc", "qqp_paws", "sentiment", "stanford", "stanford_sent", "tamil", "compression"])),
)
@click.option("-w", "--model_weights", required=True)
@click.option("-s", "--sentence_statistics", is_flag=True)
@click.option("-b", "--bleu", type=click.Choice(["sacrebleu", "nltk"]), required=True)
def val(corpus_name, model_weights, sentence_statistics, bleu):
    corpus, word2index, index2word = load_corpus(corpus_name, ["val"])
    val_pairs = corpus["val"]

    logger.info("loading model checkpoint")
    checkpoint = torch.load(model_weights, map_location=config.DEV)
    if "word2index" in checkpoint:
        weights = checkpoint["weights"]
        word2index = checkpoint["word2index"]
        index2word = {i: w for w, i in word2index.items()}
    else:
        raise ValueError("checkpoint does not contain a word2index vocabulary")

    network = init_network(word2index)

    # remove the embedding layer before loading
    weights = {k: v for k, v in weights.items() if not "embedding" in k}
    # make a new output layer to match the weights from the checkpoint
    # we cannot remove it like we did with the embedding layers because
    # unlike those the output layer actually contains learned parameters
    vocab_size, hidden_size = weights["par_dec.out.weight"].shape
    network.par_dec.out = nn.Linear(hidden_size, vocab_size).to(config.DEV)
    # actually load the parameters
    network.load_state_dict(weights, strict=False)

    loss_fn = nn.CrossEntropyLoss()

    val_batch_iter = utils.pair_iter(pairs=val_pairs, word2index=word2index, shuffle=False, shuffle_pairs=False)

    total_val_loss = 0
    if bleu == "sacrebleu":
        total_val_bleu = 0
    elif bleu == "nltk":
        total_val_bleu_1 = 0
        total_val_bleu_2 = 0
        total_val_bleu_3 = 0
        total_val_bleu_4 = 0

    network.eval()
    for i, batch in enumerate(val_batch_iter, 1):
        input, target = batch
        prediction, words, attentions, fixations = network(input, target)

        loss = loss_fn(
            torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], target[0]
        )
        total_val_loss += loss.item()

        _prediction = [index2word[_x] for _x in torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist()]
        _target = [index2word[_x] for _x in target[0].tolist()]
        if bleu == "sacrebleu":
            bleu_score = sacrebleu.sentence_bleu(" ".join(_prediction), " ".join(_target)).score
            total_val_bleu += bleu_score
        elif bleu == "nltk":
            bleu_1_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=1)
            bleu_2_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=2)
            bleu_3_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=3)
            bleu_4_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=4)
            total_val_bleu_1 += bleu_1_score
            total_val_bleu_2 += bleu_2_score
            total_val_bleu_3 += bleu_3_score
            total_val_bleu_4 += bleu_4_score

        if sentence_statistics:
            s = [index2word[x] for x in input[0].detach().cpu().tolist()]
            attentions = attentions.detach().cpu().squeeze().tolist()
            fixations = fixations.detach().cpu().squeeze().tolist()

            if bleu == "sacrebleu":
                print(f"{bleu_score}\t{s}\t{_prediction}\t{_target}\t{attentions}\t{fixations}")
            elif bleu == "nltk":
                print(f"{bleu_1_score}\t{bleu_2_score}\t{bleu_3_score}\t{bleu_4_score}\t{s}\t{_prediction}\t{_target}\t{attentions}\t{fixations}")

    if bleu == "sacrebleu":
        print(f"avg_val_loss {total_val_loss/len(val_pairs):.4f} avg_val_bleu {total_val_bleu/len(val_pairs):.2f}")
    elif bleu == "nltk":
        print(f"avg_val_loss {total_val_loss/len(val_pairs):.4f} avg_val_bleu_1 {total_val_bleu_1/len(val_pairs):.2f} avg_val_bleu_2 {total_val_bleu_2/len(val_pairs):.2f} avg_val_bleu_3 {total_val_bleu_3/len(val_pairs):.2f} avg_val_bleu_4 {total_val_bleu_4/len(val_pairs):.2f}")
@main.command()
@click.option(
    "-c",
    "--corpus",
    "corpus_name",
    required=True,
    type=click.Choice(sorted(["wiki", "qqp", "qqp_kag", "msrpc", "qqp_paws", "sentiment", "stanford", "stanford_sent", "tamil", "compression"])),
)
@click.option("-w", "--model_weights", required=True)
@click.option("-s", "--sentence_statistics", is_flag=True)
@click.option("-b", "--bleu", type=click.Choice(["sacrebleu", "nltk"]), required=True)
def test(corpus_name, model_weights, sentence_statistics, bleu):
    corpus, word2index, index2word = load_corpus(corpus_name, ["test"])
    test_pairs = corpus["test"]

    if model_weights is not None:
        logger.info("loading model checkpoint")
        checkpoint = torch.load(model_weights, map_location=config.DEV)
        if "word2index" in checkpoint:
            weights = checkpoint["weights"]
            word2index = checkpoint["word2index"]
            index2word = {i: w for w, i in word2index.items()}
        else:
            raise ValueError("checkpoint does not contain a word2index vocabulary")

    network = init_network(word2index)

    if model_weights is not None:
        # remove the embedding layer before loading
        weights = {k: v for k, v in weights.items() if not "embedding" in k}
        # make a new output layer to match the weights from the checkpoint
        # we cannot remove it like we did with the embedding layers because
        # unlike those the output layer actually contains learned parameters
        vocab_size, hidden_size = weights["par_dec.out.weight"].shape
        network.par_dec.out = nn.Linear(hidden_size, vocab_size).to(config.DEV)
        # actually load the parameters
        network.load_state_dict(weights, strict=False)

    loss_fn = nn.CrossEntropyLoss()

    test_batch_iter = utils.pair_iter(pairs=test_pairs, word2index=word2index, shuffle=False, shuffle_pairs=False)

    total_test_loss = 0
    if bleu == "sacrebleu":
        total_test_bleu = 0
    elif bleu == "nltk":
        total_test_bleu_1 = 0
        total_test_bleu_2 = 0
        total_test_bleu_3 = 0
        total_test_bleu_4 = 0

    network.eval()
    for i, batch in enumerate(test_batch_iter, 1):
        input, target = batch

        prediction, words, attentions, fixations = network(input, target)

        loss = loss_fn(
            torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], target[0]
        )
        total_test_loss += loss.item()

        _prediction = [index2word[_x] for _x in torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist()]
        _target = [index2word[_x] for _x in target[0].tolist()]
        if bleu == "sacrebleu":
            bleu_score = sacrebleu.sentence_bleu(" ".join(_prediction), " ".join(_target)).score
            total_test_bleu += bleu_score
        elif bleu == "nltk":
            bleu_1_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=1)
            bleu_2_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=2)
            bleu_3_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=3)
            bleu_4_score = utils.bleu(target[0].tolist(), torch.argmax(torch.stack(prediction).squeeze(1)[: target[0].shape[0], :], -1).tolist(), n=4)
            total_test_bleu_1 += bleu_1_score
            total_test_bleu_2 += bleu_2_score
            total_test_bleu_3 += bleu_3_score
            total_test_bleu_4 += bleu_4_score

        if sentence_statistics:
            s = [index2word[x] for x in input[0].detach().cpu().tolist()]
            attentions = attentions.detach().cpu().squeeze().tolist()
            fixations = fixations.detach().cpu().squeeze().tolist()

            if bleu == "sacrebleu":
                print(f"{bleu_score}\t{s}\t{_prediction}\t{attentions}\t{fixations}")
            elif bleu == "nltk":
                print(f"{bleu_1_score}\t{bleu_2_score}\t{bleu_3_score}\t{bleu_4_score}\t{s}\t{_prediction}\t{attentions}\t{fixations}")

    if bleu == "sacrebleu":
        print(f"avg_test_loss {total_test_loss/len(test_pairs):.4f} avg_test_bleu {total_test_bleu/len(test_pairs):.2f}")
    elif bleu == "nltk":
        print(f"avg_test_loss {total_test_loss/len(test_pairs):.4f} avg_test_bleu_1 {total_test_bleu_1/len(test_pairs):.2f} avg_test_bleu_2 {total_test_bleu_2/len(test_pairs):.2f} avg_test_bleu_3 {total_test_bleu_3/len(test_pairs):.2f} avg_test_bleu_4 {total_test_bleu_4/len(test_pairs):.2f}")
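# Note: with -s/--sentence_statistics, `test` prints one tab-separated row per
# sentence (for sacrebleu: bleu, sentence, prediction, attentions, fixations).
# Redirected to a file, this appears to be the format the helper scripts under
# utils/ below consume.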
@main.command()
@click.option(
    "-c",
    "--corpus",
    "corpus_name",
    required=True,
    type=click.Choice(sorted(["wiki", "qqp", "qqp_kag", "msrpc", "qqp_paws", "sentiment", "stanford", "stanford_sent", "tamil", "compression"])),
)
@click.option("-w", "--model_weights", required=False)
def predict(corpus_name, model_weights):
    corpus, word2index, index2word = load_corpus(corpus_name, ["val"])
    test_pairs = corpus["val"]

    if model_weights is not None:
        logger.info("loading model checkpoint")
        checkpoint = torch.load(model_weights, map_location=config.DEV)
        if "word2index" in checkpoint:
            weights = checkpoint["weights"]
            word2index = checkpoint["word2index"]
            index2word = {i: w for w, i in word2index.items()}
        else:
            raise ValueError("checkpoint does not contain a word2index vocabulary")

    network = init_network(word2index)

    logger.info(f"vocab size {len(word2index)}")

    if model_weights is not None:
        # remove the embedding layer before loading
        weights = {k: v for k, v in weights.items() if not "embedding" in k}
        # make a new output layer to match the weights from the checkpoint
        # we cannot remove it like we did with the embedding layers because
        # unlike those the output layer actually contains learned parameters
        vocab_size, hidden_size = weights["par_dec.out.weight"].shape
        network.par_dec.out = nn.Linear(hidden_size, vocab_size).to(config.DEV)
        # actually load the parameters
        network.load_state_dict(weights, strict=False)

    test_batch_iter = utils.pair_iter(pairs=test_pairs, word2index=word2index, shuffle=False, shuffle_pairs=False)

    network.eval()
    for i, batch in enumerate(test_batch_iter, 1):
        input, target = batch

        prediction, words, attentions, fixations = network(input, target)
        _prediction = [index2word[_x] for _x in torch.argmax(torch.stack(prediction).squeeze(1), -1).tolist()]

        s = [index2word[x] for x in input[0].detach().cpu().tolist()]
        attentions = attentions.detach().cpu().squeeze().tolist()
        fixations = fixations.detach().cpu().squeeze().tolist()

        print(f"{s}\t{_prediction}\t{attentions}\t{fixations}")
@main.command()
@click.option("-w", "--model_weights", required=True)
@click.argument("path")
def predict_file(model_weights, path):
    logger.info("loading sentences")
    sentences = []
    lang = corpora.Lang("pred")
    with open(path) as h:
        for line in h:
            line = line.strip()
            if line:
                sentence = line.split(" ")
                lang.add_sentence(sentence)
                sentences.append(sentence)
    word2index = lang.word2index
    index2word = {i: w for w, i in word2index.items()}

    logger.info(f"{len(sentences)} sentences loaded")

    logger.info("loading model checkpoint")
    checkpoint = torch.load(model_weights, map_location=config.DEV)
    if "word2index" in checkpoint:
        weights = checkpoint["weights"]
        word2index = checkpoint["word2index"]
        index2word = {i: w for w, i in word2index.items()}
    else:
        raise ValueError("checkpoint does not contain a word2index vocabulary")

    network = init_network(word2index)

    logger.info(f"vocab size {len(word2index)}")

    # remove the embedding layer before loading
    weights = {k: v for k, v in weights.items() if not "embedding" in k}
    # make a new output layer to match the weights from the checkpoint
    # we cannot remove it like we did with the embedding layers because
    # unlike those the output layer actually contains learned parameters
    vocab_size, hidden_size = weights["par_dec.out.weight"].shape
    network.par_dec.out = nn.Linear(hidden_size, vocab_size).to(config.DEV)
    # actually load the parameters
    network.load_state_dict(weights, strict=False)

    debug_sentence_iter = utils.sent_iter(sentences, word2index=word2index)

    network.eval()
    for i, input in enumerate(debug_sentence_iter, 1):
        prediction, words, attentions, fixations = network(input)
        _prediction = [index2word[_x] for _x in torch.argmax(torch.stack(prediction).squeeze(1), -1).tolist()]

        s = [index2word[x] for x in input[0].detach().cpu().tolist()]
        attentions = attentions.detach().cpu().squeeze().tolist()
        fixations = fixations.detach().cpu().squeeze().tolist()

        print(f"{s}\t{_prediction}\t{attentions}\t{fixations}")


if __name__ == "__main__":
    main()
19
joint_paraphrase_model/requirements.txt
Normal file

@@ -0,0 +1,19 @@
click==7.1.2
cycler==0.10.0
dataclasses==0.6
future==0.18.2
joblib==0.17.0
kiwisolver==1.3.1
matplotlib==3.3.3
nltk==3.5
numpy==1.19.4
Pillow==8.0.1
portalocker==2.0.0
pyparsing==2.4.7
python-dateutil==2.8.1
regex==2020.11.13
sacrebleu==1.4.6
six==1.15.0
torch==1.7.0
tqdm==4.54.1
typing-extensions==3.7.4.3
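# The usual pip workflow applies (not part of the original file):
#   pip install -r requirements.txt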
43
joint_paraphrase_model/utils/long_sentence_split.py
Normal file

@@ -0,0 +1,43 @@
import os
import sys

import click


def read(path):
    with open(path) as h:
        for line in h:
            line = line.strip()
            try:
                b, s, p, a, f = line.split("\t")
            except ValueError:
                print(f"skipping line {line}", file=sys.stderr)
                continue
            else:
                yield b, s, p, a, f


@click.command()
@click.argument("path")
def main(path):
    data = list(read(path))
    avg_len = sum(len(x[1]) for x in data)/len(data)

    fname, ext = os.path.splitext(path)
    with open(f"{fname}_long{ext}", "w") as lh, open(f"{fname}_short{ext}", "w") as sh:
        for x in data:
            if len(x[1]) > avg_len:
                lh.write("\t".join(x))
                lh.write("\n")
            else:
                sh.write("\t".join(x))
                sh.write("\n")

    print(f"avg sentence length {avg_len}")


if __name__ == "__main__":
    main()
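# Example (assuming stats.tsv holds the 5-column `test -s` output redirected
# to a file):
#   python long_sentence_split.py stats.tsv
# which writes stats_long.tsv and stats_short.tsv, split at the average
# sentence length.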
40
joint_paraphrase_model/utils/long_sentence_stats.py
Normal file

@@ -0,0 +1,40 @@
import sys

import click


def read(path):
    with open(path) as h:
        for line in h:
            line = line.strip()
            try:
                b, s, p, *_ = line.split("\t")
            except ValueError:
                print(f"skipping line {line}", file=sys.stderr)
                continue
            else:
                yield float(b), s, p


@click.command()
@click.argument("path")
def main(path):
    data = list(read(path))
    avg_len = sum(len(x[1]) for x in data)/len(data)
    filtered_data = []
    filtered_data2 = []
    for x in data:
        if len(x[1]) > avg_len:
            filtered_data.append(x)
        else:
            filtered_data2.append(x)
    print(f"avg sentence length {avg_len}")
    print(f"long sentences {len(filtered_data)}")
    print(f"short sentences {len(filtered_data2)}")
    print(f"total bleu {sum(x[0] for x in data)/len(data)}")
    print(f"longest bleu {sum(x[0] for x in filtered_data)/len(filtered_data)}")
    print(f"shortest bleu {sum(x[0] for x in filtered_data2)/len(filtered_data2)}")


if __name__ == "__main__":
    main()
64
joint_paraphrase_model/utils/plot_attention.py
Normal file

@@ -0,0 +1,64 @@
import ast
import os
import pathlib

import text_attention

import click
import matplotlib.pyplot as plt
plt.switch_backend("agg")
import matplotlib.ticker as ticker
import numpy as np
import tqdm


def plot_attention(input_sentence, output_words, attentions, path):
    # Set up figure with colorbar
    attentions = np.array(attentions)[:, :len(input_sentence)]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions, cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.savefig(f"{path}.pdf")
    plt.close()


def parse(p):
    with open(p) as h:
        for line in h:
            if not line or line.startswith("#"):
                continue
            _sentence, _prediction, _attention, _fixations = line.strip().split("\t")
            try:
                sentence = ast.literal_eval(_sentence)
                prediction = ast.literal_eval(_prediction)
                attention = ast.literal_eval(_attention)
            except (ValueError, SyntaxError):
                continue

            yield sentence, prediction, attention


@click.command()
@click.argument("path", nargs=-1, required=True)
def main(path):
    for p in tqdm.tqdm(path):
        out_dir = os.path.splitext(p)[0]
        if out_dir == p:
            out_dir = f"{out_dir}_"
        pathlib.Path(out_dir).mkdir(exist_ok=True)
        for i, spa in enumerate(parse(p)):
            plot_attention(*spa, path=os.path.join(out_dir, str(i)))


if __name__ == "__main__":
    main()
70
joint_paraphrase_model/utils/text_attention.py
Executable file

@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date: 2019-03-29 16:10:23
# @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com
# @Last Modified time: 2019-04-12 09:56:12


## convert the text/attention list to latex code, which will further generate the text heatmap based on attention weights.
import numpy as np

latex_special_token = ["!@#$%^&*()"]


def generate(text_list, attention_list, latex_file, color='red', rescale_value=False):
    assert(len(text_list) == len(attention_list))
    if rescale_value:
        attention_list = rescale(attention_list)
    word_num = len(text_list)
    text_list = clean_word(text_list)
    with open(latex_file, 'w') as f:
        f.write(r'''\documentclass[varwidth]{standalone}
\special{papersize=210mm,297mm}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
\begin{document}
\begin{CJK*}{UTF8}{gbsn}''' + '\n')
        string = r'''{\setlength{\fboxsep}{0pt}\colorbox{white!0}{\parbox{0.9\textwidth}{''' + "\n"
        for idx in range(word_num):
            string += "\\colorbox{%s!%s}{" % (color, attention_list[idx]) + "\\strut " + text_list[idx] + "} "
        string += "\n}}}"
        f.write(string + '\n')
        f.write(r'''\end{CJK*}
\end{document}''')


def rescale(input_list):
    the_array = np.asarray(input_list)
    the_max = np.max(the_array)
    the_min = np.min(the_array)
    rescale = (the_array - the_min)/(the_max - the_min)*100
    return rescale.tolist()


def clean_word(word_list):
    new_word_list = []
    for word in word_list:
        for latex_sensitive in ["\\", "%", "&", "^", "#", "_", "{", "}"]:
            if latex_sensitive in word:
                word = word.replace(latex_sensitive, '\\' + latex_sensitive)
        new_word_list.append(word)
    return new_word_list


if __name__ == '__main__':
    ## This is a demo:

    sent = '''the USS Ronald Reagan - an aircraft carrier docked in Japan - during his tour of the region, vowing to "defeat any attack and meet any use of conventional or nuclear weapons with an overwhelming and effective American response".
North Korea and the US have ratcheted up tensions in recent weeks and the movement of the strike group had raised the question of a pre-emptive strike by the US.
On Wednesday, Mr Pence described the country as the "most dangerous and urgent threat to peace and security" in the Asia-Pacific.'''
    sent = '''我 回忆 起 我 曾经 在 大学 年代 , 我们 经常 喜欢 玩 “ Hawaii guitar ” 。 说起 Guitar , 我 想起 了 西游记 里 的 琵琶精 。
今年 下半年 , 中 美 合拍 的 西游记 即将 正式 开机 , 我 继续 扮演 美猴王 孙悟空 , 我 会 用 美猴王 艺术 形象 努力 创造 一 个 正能量 的 形象 , 文 体 两 开花 , 弘扬 中华 文化 , 希望 大家 能 多多 关注 。'''
    words = sent.split()
    word_num = len(words)
    attention = [(x + 1.)/word_num*100 for x in range(word_num)]
    import random
    random.seed(42)
    random.shuffle(attention)
    color = 'red'
    generate(words, attention, "sample.tex", color)
428
joint_sentence_compression_model/.gitignore
vendored
Normal file
3
joint_sentence_compression_model/README.md
Normal file

@@ -0,0 +1,3 @@
# joint_sentence_compression

joint training for sentence compression -- NeurIPS submission
39
joint_sentence_compression_model/config.py
Normal file

@@ -0,0 +1,39 @@
import os
import torch


# general
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PAD = "<__PAD__>"
UNK = "<__UNK__>"
NOFIX = "<__NOFIX__>"
SOS = "<__SOS__>"
EOS = "<__EOS__>"

batch_size = 1
teacher_forcing_ratio = 0.5
embedding_dim = 300
fix_hidden_dim = 128
sem_hidden_dim = 1024
fix_dropout = 0.5
par_dropout = 0.2
_fix_learning_rate = 0.00001
_par_learning_rate = 0.0001
learning_rate = _par_learning_rate
fix_momentum = 0.9
par_momentum = 0.0
max_length = 851
epochs = 5

# paths
data_path = "./data"

emb_path = os.path.join(data_path, "Google_word2vec/GoogleNews-vectors-negative300.bin")

glove_path = "glove.840B.300d.txt"

google_path = os.path.join(data_path, "datasets/sentence-compression/data")
google_train_path = os.path.join(google_path, "train_mask_token.tsv")
google_dev_path = os.path.join(google_path, "dev_mask_token.tsv")
google_test_path = os.path.join(google_path, "test_mask_token.tsv")
1
joint_sentence_compression_model/data
Symbolic link

@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/
1
joint_sentence_compression_model/glove.840B.300d.txt
Symbolic link

@@ -0,0 +1 @@
/netpool/work/gpu-2/users/soodea/datasets/glove/glove.840B.300d.txt
0
joint_sentence_compression_model/libs/__init__.py
Normal file
97
joint_sentence_compression_model/libs/corpora.py
Normal file
@@ -0,0 +1,97 @@
import logging

import config


def tokenize(sent):
    return sent.split(" ")


class Lang:
    """Represents the vocabulary"""

    def __init__(self, name):
        self.name = name
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
        }
        self.word2count = {}
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
        }
        self.n_words = 2

    def add_sentence(self, sentence):
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count.keys():
            new_lang.add_word(word)
        for word in other.word2count.keys():
            new_lang.add_word(word)

        # Fix the counts on the new one
        for word in new_lang.word2count.keys():
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang


def load_google(split, max_len=None):
    """Load the Google Sentence Compression Dataset"""
    logger = logging.getLogger(f"{__name__}.load_compression")
    lang = Lang("compression")

    if split == "train":
        path = config.google_train_path
    elif split == "val":
        path = config.google_dev_path
    elif split == "test":
        path = config.google_test_path
    else:
        raise ValueError(f"unknown split: {split}")

    logger.info("loading %s from %s" % (split, path))

    data = []
    sent = []
    mask = []
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if line:
                w, d = line.split("\t")
                sent.append(w)
                mask.append(int(d))
            else:
                if sent and (max_len is None or len(sent) <= max_len):
                    data.append([sent, mask])
                    lang.add_sentence(sent)
                sent = []
                mask = []
    if sent:
        data.append([sent, mask])
        lang.add_sentence(sent)

    return data, lang
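A minimal usage sketch for `Lang` and `load_google` follows; the variable names are illustrative, and the `config.google_*` paths are assumed to exist.

```python
from libs.corpora import load_google

train_data, train_lang = load_google("train", max_len=200)
val_data, val_lang = load_google("val")

merged = train_lang + val_lang      # Lang.__add__ merges vocabularies and sums counts
sent, mask = train_data[0]          # token list and per-token 0/1 deletion mask
indices = [merged.word2index[w] for w in sent]
```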
@@ -0,0 +1 @@
from .main import *
@@ -0,0 +1,125 @@
from collections import OrderedDict
import logging
import sys

from .self_attention import Transformer

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence, pad_sequence


def random_embedding(vocab_size, embedding_dim):
    pretrain_emb = np.empty([vocab_size, embedding_dim])
    scale = np.sqrt(3.0 / embedding_dim)
    for index in range(vocab_size):
        pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
    return pretrain_emb


def neg_log_likelihood_loss(outputs, batch_label, batch_size, seq_len):
    outputs = outputs.view(batch_size * seq_len, -1)
    score = F.log_softmax(outputs, 1)

    # reduction="sum" replaces the deprecated size_average=False
    loss = nn.NLLLoss(ignore_index=0, reduction="sum")(
        score, batch_label.view(batch_size * seq_len)
    )
    loss = loss / batch_size
    _, tag_seq = torch.max(score, 1)
    tag_seq = tag_seq.view(batch_size, seq_len)

    return loss, tag_seq


def mse_loss(outputs, batch_label, batch_size, seq_len, word_seq_length):
    score = torch.sigmoid(outputs)

    # mask out padding positions beyond each sequence's true length
    mask = torch.zeros_like(score)
    for i, v in enumerate(word_seq_length):
        mask[i, 0:v] = 1

    score = score * mask

    loss = nn.MSELoss(reduction="sum")(
        score.view(batch_size, seq_len), batch_label.view(batch_size, seq_len)
    )

    loss = loss / batch_size

    return loss, score.view(batch_size, seq_len)


class Network(nn.Module):
    def __init__(
        self,
        embedding_type,
        vocab_size,
        embedding_dim,
        dropout,
        hidden_dim,
        embeddings=None,
        attention=True,
    ):
        super().__init__()
        self.logger = logging.getLogger(f"{__name__}")
        self.attention = attention
        prelayers = OrderedDict()
        postlayers = OrderedDict()

        if embedding_type in ("w2v", "glove"):
            if embeddings is not None:
                prelayers["embedding_layer"] = nn.Embedding.from_pretrained(embeddings, freeze=True)
            else:
                prelayers["embedding_layer"] = nn.Embedding(vocab_size, embedding_dim)
            prelayers["embedding_dropout_layer"] = nn.Dropout(dropout)
            embedding_dim = 300
        elif embedding_type == "bert":
            embedding_dim = 768

        self.lstm = BiLSTM(embedding_dim, hidden_dim // 2, num_layers=1)
        postlayers["lstm_dropout_layer"] = nn.Dropout(dropout)

        if self.attention:
            postlayers["attention_layer"] = Transformer(
                d_model=hidden_dim, n_heads=4, n_layers=1
            )

        postlayers["ff_layer"] = nn.Linear(hidden_dim, hidden_dim // 2)
        postlayers["ff_activation"] = nn.ReLU()
        postlayers["output_layer"] = nn.Linear(hidden_dim // 2, 1)

        self.logger.info(f"prelayers: {prelayers.keys()}")
        self.logger.info(f"postlayers: {postlayers.keys()}")

        self.pre = nn.Sequential(prelayers)
        self.post = nn.Sequential(postlayers)

    def forward(self, x, word_seq_length):
        x = self.pre(x)
        x = self.lstm(x, word_seq_length)

        # run the post layers on each sequence individually, without padding
        output = []
        for _x, l in zip(x.transpose(1, 0), word_seq_length):
            output.append(self.post(_x[:l].unsqueeze(0))[0])

        return pad_sequence(output, batch_first=True)


class BiLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden, num_layers):
        super().__init__()
        self.net = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, x, word_seq_length):
        packed_words = pack_padded_sequence(x, word_seq_length, True, False)
        lstm_out, hidden = self.net(packed_words)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        return lstm_out
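A shape sketch for this fixation `Network`, with hypothetical sizes (randomly initialized embeddings stand in for GloVe):

```python
import torch

net = Network(
    embedding_type="glove",
    vocab_size=50,
    embedding_dim=300,
    dropout=0.5,
    hidden_dim=128,
)
x = torch.randint(0, 50, (2, 9))   # batch of 2 padded sentences, 9 token ids each
lengths = [9, 7]                   # true sequence lengths
scores = net(x, lengths)           # (2, 9, 1): one fixation logit per token
```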
@@ -0,0 +1,128 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_hid, n_position=200):
        super(PositionalEncoding, self).__init__()

        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''
        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()


class AttentionLayer(nn.Module):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def forward(self, Q, K, V):
        # Q: float32:[batch_size, n_queries, d_k]
        # K: float32:[batch_size, n_keys, d_k]
        # V: float32:[batch_size, n_keys, d_v]
        dk = K.shape[-1]
        dv = V.shape[-1]
        KT = torch.transpose(K, -1, -2)
        weight_logits = torch.bmm(Q, KT) / math.sqrt(dk)
        # weight_logits: float32[batch_size, n_queries, n_keys]
        weights = F.softmax(weight_logits, dim=-1)
        # weights: float32[batch_size, n_queries, n_keys]
        return torch.bmm(weights, V)
        # return float32[batch_size, n_queries, dv]


class MultiHeadedSelfAttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(MultiHeadedSelfAttentionLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        print('{} {}'.format(d_model, n_heads))
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.d_v = self.d_k
        self.attention_layer = AttentionLayer()
        self.W_Qs = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Ks = nn.ModuleList([
            nn.Linear(d_model, self.d_k, bias=False)
            for _ in range(n_heads)
        ])
        self.W_Vs = nn.ModuleList([
            nn.Linear(d_model, self.d_v, bias=False)
            for _ in range(n_heads)
        ])
        self.W_O = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        head_outputs = []
        for W_Q, W_K, W_V in zip(self.W_Qs, self.W_Ks, self.W_Vs):
            Q = W_Q(x)
            # Q: float32[batch_size, sequence_length, self.d_k]
            K = W_K(x)
            # K: float32[batch_size, sequence_length, self.d_k]
            V = W_V(x)
            # V: float32[batch_size, sequence_length, self.d_v]
            head_output = self.attention_layer(Q, K, V)
            # head_output: float32[batch_size, sequence_length, self.d_v]
            head_outputs.append(head_output)
        concatenated = torch.cat(head_outputs, dim=-1)
        # concatenated: float32[batch_size, sequence_length, self.d_model]
        out = self.W_O(concatenated)
        # out: float32[batch_size, sequence_length, self.d_model]
        return out


class Feedforward(nn.Module):
    def __init__(self, d_model):
        super(Feedforward, self).__init__()
        self.d_model = d_model
        self.W1 = nn.Linear(d_model, d_model)
        self.W2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: float32[batch_size, sequence_length, d_model]
        return self.W2(torch.relu(self.W1(x)))


class Transformer(nn.Module):
    def __init__(self, d_model, n_heads, n_layers):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.attention_layers = nn.ModuleList([
            MultiHeadedSelfAttentionLayer(d_model, n_heads)
            for _ in range(n_layers)
        ])
        self.ffs = nn.ModuleList([
            Feedforward(d_model)
            for _ in range(n_layers)
        ])

    def forward(self, x):
        # x: float32[batch_size, sequence_length, self.d_model]
        for attention_layer, ff in zip(self.attention_layers, self.ffs):
            attention_out = attention_layer(x)
            # attention_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + attention_out, x.shape[2:])
            ff_out = ff(x)
            # ff_out: float32[batch_size, sequence_length, self.d_model]
            x = F.layer_norm(x + ff_out, x.shape[2:])
        return x
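A quick shape check for the `Transformer` block above (sizes chosen arbitrarily for illustration):

```python
import torch

model = Transformer(d_model=128, n_heads=4, n_layers=1)
x = torch.randn(2, 7, 128)          # 2 sequences, 7 tokens, width 128
out = model(x)
assert out.shape == (2, 7, 128)     # self-attention preserves the input shape
```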
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2018, Tatsuya Aoki
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,31 @@
# Simple Model for Sentence Compression
3-layered BILSTM model for sentence compression, referred to as Baseline in [Klerke et al., NAACL 2016](http://aclweb.org/anthology/N/N16/N16-1179.pdf).
## Requirements
### Framework
- python (<= 3.6)
- pytorch (<= 0.3.0)

### Packages
- torchtext

## How to run
```
./getdata
python main.py
```
To run the scripts with a GPU, use `python main.py --gpu-id ID`, where ID is an integer from 0 to the number of GPUs you have.
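For example, on a machine with a single GPU the invocation would be `python main.py --gpu-id 0`.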
## Reference

```
@InProceedings{klerke-goldberg-sogaard:2016:N16-1,
  author    = {Klerke, Sigrid and Goldberg, Yoav and S{\o}gaard, Anders},
  title     = {Improving sentence compression by learning to predict gaze},
  booktitle = {Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2016},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  pages     = {1528--1533},
  url       = {http://www.aclweb.org/anthology/N16-1179}
}
```
@@ -0,0 +1 @@
from .main import *
@@ -0,0 +1,95 @@
from torchtext import data
from const import Phase


def create_dataset(data: dict, batch_size: int, device: int):

    train = Dataset(data[Phase.TRAIN]['tokens'],
                    data[Phase.TRAIN]['labels'],
                    vocab=None,
                    batch_size=batch_size,
                    device=device,
                    phase=Phase.TRAIN)

    dev = Dataset(data[Phase.DEV]['tokens'],
                  data[Phase.DEV]['labels'],
                  vocab=train.vocab,
                  batch_size=batch_size,
                  device=device,
                  phase=Phase.DEV)

    test = Dataset(data[Phase.TEST]['tokens'],
                   data[Phase.TEST]['labels'],
                   vocab=train.vocab,
                   batch_size=batch_size,
                   device=device,
                   phase=Phase.TEST)
    return train, dev, test


class Dataset:
    def __init__(self,
                 tokens: list,
                 label_list: list,
                 vocab: list,
                 batch_size: int,
                 device: int,
                 phase: Phase):
        assert len(tokens) == len(label_list), \
            'the number of sentences and the number of POS/head sequences \
            should be the same length'

        self.pad_token = '<PAD>'
        # self.unk_token = '<UNK>'
        self.tokens = tokens
        self.label_list = label_list
        self.sentence_id = [[i] for i in range(len(tokens))]
        self.device = device

        self.token_field = data.Field(use_vocab=True,
                                      # unk_token=self.unk_token,
                                      pad_token=self.pad_token,
                                      batch_first=True)
        self.label_field = data.Field(use_vocab=False, pad_token=-1, batch_first=True)
        self.sentence_id_field = data.Field(use_vocab=False, batch_first=True)
        self.dataset = self._create_dataset()

        if vocab is None:
            self.token_field.build_vocab(self.tokens)
            self.vocab = self.token_field.vocab
        else:
            self.token_field.vocab = vocab
            self.vocab = vocab
        self.pad_index = self.token_field.vocab.stoi[self.pad_token]

        self._set_batch_iter(batch_size, phase)

    def get_raw_sentence(self, sentences):
        return [[self.vocab.itos[idx] for idx in sentence]
                for sentence in sentences]

    def _create_dataset(self):
        _fields = [('token', self.token_field),
                   ('label', self.label_field),
                   ('sentence_id', self.sentence_id_field)]
        return data.Dataset(self._get_examples(_fields), _fields)

    def _get_examples(self, fields: list):
        ex = []
        for sentence, label, sentence_id in zip(self.tokens, self.label_list, self.sentence_id):
            ex.append(data.Example.fromlist([sentence, label, sentence_id], fields))
        return ex

    def _set_batch_iter(self, batch_size: int, phase: Phase):

        # renamed the argument so it no longer shadows the torchtext `data` module
        def sort(example) -> int:
            return len(getattr(example, 'token'))

        train = phase == Phase.TRAIN

        self.batch_iter = data.BucketIterator(dataset=self.dataset,
                                              batch_size=batch_size,
                                              sort_key=sort,
                                              train=train,
                                              repeat=False,
                                              device=self.device)
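A minimal driving sketch for `create_dataset` (toy data; with the legacy torchtext API assumed here, `device=-1` selects the CPU):

```python
from batch import create_dataset
from const import Phase

data = {
    Phase.TRAIN: {"tokens": [["a", "man", "walks"]], "labels": [[0, 0, 1]]},
    Phase.DEV:   {"tokens": [["a", "dog", "runs"]],  "labels": [[0, 1, 0]]},
    Phase.TEST:  {"tokens": [["a", "cat", "sits"]],  "labels": [[0, 0, 0]]},
}
train, dev, test = create_dataset(data, batch_size=1, device=-1)
for batch in train.batch_iter:
    tokens, labels = batch.token, batch.label   # padded LongTensors
```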
@@ -0,0 +1,8 @@
from enum import Enum, unique


@unique
class Phase(Enum):
    TRAIN = 'train'
    DEV = 'dev'
    TEST = 'test'
@@ -0,0 +1,92 @@
import torch
import torch.nn as nn


class Network(nn.Module):
    def __init__(self,
                 embeddings,
                 hidden_size: int,
                 prior,
                 device: torch.device):

        super(Network, self).__init__()
        self.device = device
        self.priors = torch.log(torch.tensor([prior, 1-prior])).to(device)
        self.hidden_size = hidden_size
        self.bilstm_layers = 3
        self.bilstm_input_size = 300
        self.bilstm_output_size = 2 * hidden_size
        self.word_emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.bilstm = nn.LSTM(self.bilstm_input_size,
                              self.hidden_size,
                              num_layers=self.bilstm_layers,
                              batch_first=True,
                              dropout=0.1,  # ms best mod 0.1
                              bidirectional=True)
        self.dropout = nn.Dropout(p=0.35)
        # NOTE: self.attention resolves to the bound method defined below,
        # so this condition is always truthy and the attention sublayers
        # are always built
        if self.attention:
            self.attention_size = self.bilstm_output_size * 2
            self.u_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
            self.w_a = nn.Linear(self.bilstm_output_size, self.bilstm_output_size)
            self.v_a_inv = nn.Linear(self.bilstm_output_size, 1, bias=False)
            self.linear_attn = nn.Linear(self.attention_size, self.bilstm_output_size)
        self.linear = nn.Linear(self.bilstm_output_size, self.hidden_size)
        self.pred = nn.Linear(self.hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=1)
        self.criterion = nn.NLLLoss(ignore_index=-1)

    def forward(self, input_tokens, labels, fixations=None):
        loss = 0.0
        preds = []
        atts = []
        batch_size, seq_len = input_tokens.size()
        self.init_hidden(batch_size, device=self.device)

        x_i = self.word_emb(input_tokens)
        x_i = self.dropout(x_i)

        hidden, (self.h_n, self.c_n) = self.bilstm(x_i, (self.h_n, self.c_n))
        _, _, hidden_size = hidden.size()

        for i in range(seq_len):
            nth_hidden = hidden[:, i, :]
            if self.attention:
                target = nth_hidden.expand(seq_len, batch_size, -1).transpose(0, 1)
                mask = hidden.eq(target)[:, :, 0].unsqueeze(2)
                attn_weight = self.attention(hidden, target, fixations, mask)
                context_vector = torch.bmm(attn_weight.transpose(1, 2), hidden).squeeze(1)

                nth_hidden = torch.tanh(self.linear_attn(torch.cat((nth_hidden, context_vector), -1)))
                atts.append(attn_weight.detach().cpu())
            logits = self.pred(self.linear(nth_hidden))
            if not self.training:
                # at evaluation time, shift the logits by the class log-priors
                logits = logits + self.priors
            output = self.softmax(logits)
            loss += self.criterion(output, labels[:, i])

            _, topi = output.topk(k=1, dim=1)
            pred = topi.squeeze(-1)
            preds.append(pred)

        preds = torch.stack(torch.cat(preds, dim=0).split(batch_size), dim=1)

        if atts:
            atts = torch.stack(torch.cat(atts, dim=0).split(batch_size), dim=1)

        return loss, preds, atts

    def attention(self, source, target, fixations=None, mask=None):
        function_g = \
            self.v_a_inv(torch.tanh(self.u_a(source) + self.w_a(target)))
        if mask is not None:
            function_g.masked_fill_(mask, -1e4)
        if fixations is not None:
            function_g = function_g * fixations
        return nn.functional.softmax(function_g, dim=1)

    def init_hidden(self, batch_size, device):
        # plain tensors replace the deprecated torch.autograd.Variable
        zeros = torch.zeros(2 * self.bilstm_layers, batch_size, self.hidden_size)
        self.h_n = zeros.to(device)
        self.c_n = zeros.to(device)
        return self.h_n, self.c_n
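A toy instantiation of this compression network (random embeddings stand in for pretrained ones; `prior` is the expected deletion rate):

```python
import torch

emb = torch.randn(50, 300)                       # 50-word toy vocabulary
net = Network(embeddings=emb, hidden_size=64, prior=0.5,
              device=torch.device("cpu"))
tokens = torch.randint(0, 50, (2, 6))            # batch of 2, 6 token ids each
labels = torch.randint(0, 2, (2, 6))             # 0 = keep, 1 = delete
loss, preds, atts = net(tokens, labels)
```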
@@ -0,0 +1,183 @@
import torch
from torch import optim
import tqdm

from const import Phase
from batch import create_dataset
from models import Baseline
from sklearn.metrics import classification_report


def run(dataset_train,
        dataset_dev,
        dataset_test,
        model_type,
        word_embed_size,
        hidden_size,
        batch_size,
        device,
        n_epochs):

    if model_type == 'base':
        model = Baseline(vocab=dataset_train.vocab,
                         word_embed_size=word_embed_size,
                         hidden_size=hidden_size,
                         device=device,
                         inference=False)
    else:
        raise NotImplementedError
    model = model.to(device)

    optim_params = model.parameters()
    optimizer = optim.Adam(optim_params, lr=10**-3)

    print('start training')
    for epoch in range(n_epochs):
        train_loss, tokens, preds, golds = train(dataset_train,
                                                 model,
                                                 optimizer,
                                                 batch_size,
                                                 epoch,
                                                 Phase.TRAIN,
                                                 device)

        dev_loss, tokens, preds, golds = train(dataset_dev,
                                               model,
                                               optimizer,
                                               batch_size,
                                               epoch,
                                               Phase.DEV,
                                               device)
        logger = '\t'.join(['epoch {}'.format(epoch+1),
                            'TRAIN Loss: {:.9f}'.format(train_loss),
                            'DEV Loss: {:.9f}'.format(dev_loss)])
        # print('\r'+logger, end='')
        print(logger)
    test_loss, tokens, preds, golds = train(dataset_test,
                                            model,
                                            optimizer,
                                            batch_size,
                                            epoch,
                                            Phase.TEST,
                                            device)
    print('====', 'TEST', '=====')
    print_scores(preds, golds)
    output_results(tokens, preds, golds)


def train(dataset,
          model,
          optimizer,
          batch_size,
          n_epoch,
          phase,
          device):

    total_loss = 0.0
    tokens = []
    preds = []
    labels = []
    if phase == Phase.TRAIN:
        model.train()
    else:
        model.eval()

    for batch in tqdm.tqdm(dataset.batch_iter):
        token = getattr(batch, 'token')
        label = getattr(batch, 'label')
        raw_sentences = dataset.get_raw_sentence(token.data.detach().cpu().numpy())

        loss, pred = \
            model(token, raw_sentences, label, phase)

        if phase == Phase.TRAIN:
            # gradients exist only after backward(), so clip afterwards
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()

        # remove PAD from input sentences/labels and results
        mask = (token != dataset.pad_index)
        length_tensor = mask.sum(1)
        length_tensor = length_tensor.data.detach().cpu().numpy()

        for index, n_tokens_in_the_sentence in enumerate(length_tensor):
            if n_tokens_in_the_sentence > 0:
                tokens.append(raw_sentences[index][:n_tokens_in_the_sentence])
                _label = label[index][:n_tokens_in_the_sentence]
                _pred = pred[index][:n_tokens_in_the_sentence]
                _label = _label.data.detach().cpu().numpy()
                _pred = _pred.data.detach().cpu().numpy()
                labels.append(_label)
                preds.append(_pred)

        total_loss += torch.mean(loss).item()

    return total_loss, tokens, preds, labels


def read_two_cols_data(fname, max_len=None):
    data = {}
    tokens = []
    labels = []
    token = []
    label = []
    with open(fname, mode='r') as f:
        for line in f:
            line = line.strip().lower().split()
            if line:
                _token, _label = line
                token.append(_token)
                if _label == '0' or _label == '1':
                    label.append(int(_label))
                else:
                    # map textual labels: 'del' -> delete (1), anything else -> keep (0)
                    label.append(1 if _label == 'del' else 0)
            else:
                if max_len is None or len(token) <= max_len:
                    tokens.append(token)
                    labels.append(label)
                token = []
                label = []
    # flush the last sentence if the file does not end with a blank line
    if token and (max_len is None or len(token) <= max_len):
        tokens.append(token)
        labels.append(label)

    data['tokens'] = tokens
    data['labels'] = labels
    return data


def load(train_path, dev_path, test_path, batch_size, max_len, device):
    train = read_two_cols_data(train_path, max_len)
    dev = read_two_cols_data(dev_path)
    test = read_two_cols_data(test_path)
    data = {Phase.TRAIN: train, Phase.DEV: dev, Phase.TEST: test}
    return create_dataset(data, batch_size=batch_size, device=device)


def print_scores(preds, golds):
    _preds = [label for sublist in preds for label in sublist]
    _golds = [label for sublist in golds for label in sublist]
    target_names = ['not_del', 'del']
    print(classification_report(_golds, _preds, target_names=target_names, digits=5))


def output_results(tokens, preds, golds, path='./result/sentcomp'):
    with open(path+'.original.txt', mode='w') as w, \
            open(path+'.gold.txt', mode='w') as w_gold, \
            open(path+'.pred.txt', mode='w') as w_pred:

        for _tokens, _golds, _preds in zip(tokens, golds, preds):
            for token, gold, pred in zip(_tokens, _golds, _preds):
                w.write(token + ' ')
                # 0 -> keep, 1 -> delete
                if gold == 0:
                    w_gold.write(token + ' ')
                if pred == 0:
                    w_pred.write(token + ' ')
            w.write('\n')
            w_gold.write('\n')
            w_pred.write('\n')
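The input format expected by `read_two_cols_data` is one token and one label per line, with blank lines separating sentences; a hypothetical file and the resulting structure:

```python
# contents of example.tsv:
#   The    0
#   very   1
#   big    1
#   dog    0
#   barked 0
data = read_two_cols_data("example.tsv")
print(data["tokens"][0])   # ['the', 'very', 'big', 'dog', 'barked']
print(data["labels"][0])   # [0, 1, 1, 0, 0]
```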
218
joint_sentence_compression_model/libs/utils.py
Normal file
@@ -0,0 +1,218 @@
import json
import logging
import math
import os
import random
import re
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import torch
import torch.nn as nn

import config


plt.switch_backend("agg")


def load_glove(vocabulary):
    logger = logging.getLogger(f"{__name__}.load_glove")
    logger.info("loading embeddings")
    try:
        with open("glove.cache") as h:
            cache = json.load(h)
    except FileNotFoundError:
        logger.info("cache doesn't exist")
        cache = {}
        cache[config.PAD] = [0] * 300
        cache[config.SOS] = [0] * 300
        cache[config.EOS] = [0] * 300
        cache[config.UNK] = [0] * 300
        cache[config.NOFIX] = [0] * 300
    else:
        logger.info("cache found")

    cache_miss = False

    if not set(vocabulary) <= set(cache):
        cache_miss = True
        logger.warning("cache miss, loading full embeddings")
        data = {}
        with open("glove.840B.300d.txt") as h:
            for line in h:
                word, *emb = line.strip().split()
                try:
                    data[word] = [float(x) for x in emb]
                except ValueError:
                    continue
        logger.info("finished loading full embeddings")
        for word in vocabulary:
            try:
                cache[word] = data[word]
            except KeyError:
                cache[word] = [0] * 300
        logger.info("cache updated")

    embeddings = []
    for word in vocabulary:
        embeddings.append(torch.tensor(cache[word], dtype=torch.float32))
    embeddings = torch.stack(embeddings)

    if cache_miss:
        with open("glove.cache", "w") as h:
            json.dump(cache, h)
        logger.info("cache saved")

    return embeddings


def tokenize(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.split(" ")
    return s


def indices_from_sentence(word2index, sentence, unknown_threshold):
    if unknown_threshold:
        # randomly replace a fraction of words with UNK for regularization
        return [
            word2index.get(
                word if random.random() > unknown_threshold else config.UNK,
                word2index[config.UNK],
            )
            for word in sentence
        ]
    else:
        return [
            word2index.get(word, word2index[config.UNK]) for word in sentence
        ]


def tensor_from_sentence(word2index, sentence, unknown_threshold):
    indices = indices_from_sentence(word2index, sentence, unknown_threshold)
    return torch.tensor(indices, dtype=torch.long, device=config.DEV)


def tensors_from_pair(word2index, pair, shuffle, unknown_threshold):
    tensors = [
        tensor_from_sentence(word2index, pair[0], unknown_threshold),
        tensor_from_sentence(word2index, pair[1], unknown_threshold),
    ]
    if shuffle:
        random.shuffle(tensors)
    return tensors


def bleu(reference, hypothesis, n=4):
    if n < 1:
        return 0
    weights = [1/n]*n
    return sentence_bleu([reference], hypothesis, weights)


def pair_iter(pairs, word2index, shuffle=False, shuffle_pairs=False, unknown_threshold=0.00):
    if shuffle:
        pairs = pairs.copy()
        random.shuffle(pairs)
    for pair in pairs:
        tensor1, tensor2 = tensors_from_pair(word2index, (pair[0], pair[1]), shuffle_pairs, unknown_threshold)
        yield (tensor1,), (tensor2,)


def sent_iter(sents, word2index, batch_size, unknown_threshold=0.00):
    for i in range(len(sents)//batch_size+1):
        raw_sents = [x[0] for x in sents[i*batch_size:i*batch_size+batch_size]]
        _sents = [tensor_from_sentence(word2index, sent, unknown_threshold) for sent, target in sents[i*batch_size:i*batch_size+batch_size]]
        _targets = [torch.tensor(target, dtype=torch.long).to(config.DEV) for sent, target in sents[i*batch_size:i*batch_size+batch_size]]
        if raw_sents and _sents and _targets:
            yield raw_sents, _sents, _targets


def batch_iter(pairs, word2index, batch_size, shuffle=False, unknown_threshold=0.00):
    for i in range(len(pairs) // batch_size):
        # step through pairs in batch_size chunks rather than one at a time
        batch = pairs[i * batch_size : (i + 1) * batch_size]
        if len(batch) != batch_size:
            continue
        batch_tensors = [
            tensors_from_pair(word2index, (pair[0], pair[1]), shuffle, unknown_threshold)
            for pair in batch
        ]

        tensors1, tensors2 = zip(*batch_tensors)

        yield tensors1, tensors2


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (- %s)" % (asMinutes(s), asMinutes(rs))


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)


def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap="bone")
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([""] + input_sentence.split(" ") + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    # NOTE: expects `evaluate`, `encoder1`, and `attn_decoder1` to exist in
    # the caller's globals; kept as-is from the original tutorial-style code
    output_words, attentions = evaluate(encoder1, attn_decoder1, input_sentence)
    print("input =", input_sentence)
    print("output =", " ".join(output_words))
    showAttention(input_sentence, output_words, attentions)


def save_model(model, word2index, path):
    if not path.endswith(".tar"):
        path += ".tar"
    torch.save(
        {"weights": model.state_dict(), "word2index": word2index},
        path,
    )


def load_model(path):
    checkpoint = torch.load(path)
    return checkpoint["weights"], checkpoint["word2index"]


def extend_vocabulary(word2index, langs):
    for lang in langs:
        for word in lang.word2index:
            if word not in word2index:
                word2index[word] = len(word2index)
    return word2index
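For instance, `bleu` and `batch_iter` can be exercised in isolation (toy data, illustrative only; `batch_iter` reads `config.DEV` internally):

```python
from libs import utils

ref = "the cat sat on the mat".split()
hyp = "the cat sat on a mat".split()
print(utils.bleu(ref, hyp, n=2))           # bigram-weighted sentence BLEU

word2index = {"<__UNK__>": 0, "the": 1, "cat": 2}
pairs = [(["the", "cat"], ["the"])]
for t1, t2 in utils.batch_iter(pairs, word2index, batch_size=1):
    print(t1, t2)                          # tuples of LongTensors
```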
495
joint_sentence_compression_model/main.py
Normal file
495
joint_sentence_compression_model/main.py
Normal file
|
@ -0,0 +1,495 @@
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import click
|
||||||
|
import sacrebleu
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
import config
|
||||||
|
from libs import corpora
|
||||||
|
from libs import utils
|
||||||
|
from libs.fixation_generation import Network as FixNN
|
||||||
|
from libs.sentence_compression import Network as ComNN
|
||||||
|
from sklearn.metrics import classification_report, precision_recall_fscore_support
|
||||||
|
|
||||||
|
|
||||||
|
cwd = os.path.dirname(__file__)
|
||||||
|
|
||||||
|
logger = logging.getLogger("main")
|
||||||
|
|
||||||
|
|
||||||
|
class Network(nn.Module):
|
||||||
|
def __init__(
|
||||||
|
self, word2index, embeddings, prior,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.logger = logging.getLogger(f"{__name__}")
|
||||||
|
self.word2index = word2index
|
||||||
|
self.index2word = {i: k for k, i in word2index.items()}
|
||||||
|
self.fix_gen = FixNN(
|
||||||
|
embedding_type="glove",
|
||||||
|
vocab_size=len(word2index),
|
||||||
|
embedding_dim=config.embedding_dim,
|
||||||
|
embeddings=embeddings,
|
||||||
|
dropout=config.fix_dropout,
|
||||||
|
hidden_dim=config.fix_hidden_dim,
|
||||||
|
)
|
||||||
|
self.com_nn = ComNN(
|
||||||
|
embeddings=embeddings, hidden_size=config.sem_hidden_dim, prior=prior, device=config.DEV
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x, target, seq_lens):
|
||||||
|
x1 = nn.utils.rnn.pad_sequence(x, batch_first=True)
|
||||||
|
target = nn.utils.rnn.pad_sequence(target, batch_first=True, padding_value=-1)
|
||||||
|
|
||||||
|
fixations = torch.sigmoid(self.fix_gen(x1, seq_lens))
|
||||||
|
# fixations = None
|
||||||
|
|
||||||
|
loss, pred, atts = self.com_nn(x1, target, fixations)
|
||||||
|
return loss, pred, atts, fixations
|
||||||
|
|
||||||
|
|
||||||
|
def load_corpus(corpus_name, splits):
|
||||||
|
if not splits:
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("loading corpus")
|
||||||
|
if corpus_name == "google":
|
||||||
|
load_fn = corpora.load_google
|
||||||
|
|
||||||
|
corpus = {}
|
||||||
|
langs = []
|
||||||
|
|
||||||
|
if "train" in splits:
|
||||||
|
train_pairs, train_lang = load_fn("train", max_len=200)
|
||||||
|
corpus["train"] = train_pairs
|
||||||
|
langs.append(train_lang)
|
||||||
|
if "val" in splits:
|
||||||
|
val_pairs, val_lang = load_fn("val")
|
||||||
|
corpus["val"] = val_pairs
|
||||||
|
langs.append(val_lang)
|
||||||
|
if "test" in splits:
|
||||||
|
test_pairs, test_lang = load_fn("test")
|
||||||
|
corpus["test"] = test_pairs
|
||||||
|
langs.append(test_lang)
|
||||||
|
|
||||||
|
logger.info("creating word index")
|
||||||
|
lang = langs[0]
|
||||||
|
for _lang in langs[1:]:
|
||||||
|
lang += _lang
|
||||||
|
word2index = lang.word2index
|
||||||
|
|
||||||
|
index2word = {i: w for w, i in word2index.items()}
|
||||||
|
|
||||||
|
return corpus, word2index, index2word
|
||||||
|
|
||||||
|
|
||||||
|
def init_network(word2index, prior):
|
||||||
|
logger.info("loading embeddings")
|
||||||
|
vocabulary = sorted(word2index.keys())
|
||||||
|
embeddings = utils.load_glove(vocabulary)
|
||||||
|
|
||||||
|
logger.info("initializing model")
|
||||||
|
network = Network(word2index=word2index, embeddings=embeddings, prior=prior)
|
||||||
|
network.to(config.DEV)
|
||||||
|
|
||||||
|
print(f"#parameters: {sum(p.numel() for p in network.parameters())}")
|
||||||
|
|
||||||
|
return network
|
||||||
|
|
||||||
|
|
||||||
|
@click.group(context_settings=dict(help_option_names=["-h", "--help"]))
|
||||||
|
@click.option("-v", "--verbose", count=True)
|
||||||
|
@click.option("-d", "--debug", is_flag=True)
|
||||||
|
def main(verbose, debug):
|
||||||
|
if verbose == 0:
|
||||||
|
loglevel = logging.ERROR
|
||||||
|
elif verbose == 1:
|
||||||
|
loglevel = logging.WARN
|
||||||
|
elif verbose >= 2:
|
||||||
|
loglevel = logging.INFO
|
||||||
|
|
||||||
|
if debug:
|
||||||
|
loglevel = logging.DEBUG
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
format="[%(asctime)s] <%(name)s> %(levelname)s: %(message)s",
|
||||||
|
datefmt="%d.%m. %H:%M:%S",
|
||||||
|
level=loglevel,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug("arguments: %s" % str(sys.argv))
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.option(
|
||||||
|
"-c",
|
||||||
|
"--corpus",
|
||||||
|
"corpus_name",
|
||||||
|
required=True,
|
||||||
|
type=click.Choice(sorted(["google",])),
|
||||||
|
)
|
||||||
|
@click.option("-m", "--model_name", required=True)
|
||||||
|
@click.option("-w", "--fixation_weights", required=False)
|
||||||
|
@click.option("-f", "--freeze_fixations", is_flag=True, default=False)
|
||||||
|
@click.option("-d", "--debug", is_flag=True, default=False)
|
||||||
|
@click.option("-p", "--prior", type=float, default=.5)
|
||||||
|
def train(corpus_name, model_name, fixation_weights, freeze_fixations, debug, prior):
|
||||||
|
corpus, word2index, index2word = load_corpus(corpus_name, ["train", "val"])
|
||||||
|
train_pairs = corpus["train"]
|
||||||
|
val_pairs = corpus["val"]
|
||||||
|
network = init_network(word2index, prior)
|
||||||
|
|
||||||
|
model_dir = os.path.join("models", model_name)
|
||||||
|
logger.debug("creating model dir %s" % model_dir)
|
||||||
|
pathlib.Path(model_dir).mkdir(parents=True)
|
||||||
|
|
||||||
|
if fixation_weights is not None:
|
||||||
|
logger.info("loading fixation prediction checkpoint")
|
||||||
|
checkpoint = torch.load(fixation_weights, map_location=config.DEV)
|
||||||
|
if "word2index" in checkpoint:
|
||||||
|
weights = checkpoint["weights"]
|
||||||
|
else:
|
||||||
|
weights = checkpoint
|
||||||
|
|
||||||
|
# remove the embedding layer before loading
|
||||||
|
weights = {
|
||||||
|
k: v for k, v in weights.items() if not k.startswith("pre.embedding_layer")
|
||||||
|
}
|
||||||
|
network.fix_gen.load_state_dict(weights, strict=False)
|
||||||
|
|
||||||
|
if freeze_fixations:
|
||||||
|
logger.info("freezing fixation generation network")
|
||||||
|
for p in network.fix_gen.parameters():
|
||||||
|
p.requires_grad = False
|
||||||
|
|
||||||
|
optimizer = torch.optim.Adam(network.parameters(), lr=config.learning_rate)
|
||||||
|
|
||||||
|
best_val_loss = None
|
||||||
|
|
||||||
|
epoch = 1
|
||||||
|
batch_size = 20
|
||||||
|
|
||||||
|
while True:
|
||||||
|
train_batch_iter = utils.sent_iter(
|
||||||
|
sents=train_pairs, word2index=word2index, batch_size=batch_size
|
||||||
|
)
|
||||||
|
val_batch_iter = utils.sent_iter(
|
||||||
|
sents=val_pairs, word2index=word2index, batch_size=batch_size
|
||||||
|
)
|
||||||
|
|
||||||
|
total_train_loss = 0
|
||||||
|
total_val_loss = 0
|
||||||
|
|
||||||
|
network.train()
|
||||||
|
for i, batch in tqdm.tqdm(
|
||||||
|
enumerate(train_batch_iter, 1), total=len(train_pairs) // batch_size + 1
|
||||||
|
):
|
||||||
|
optimizer.zero_grad()
|
||||||
|
|
||||||
|
raw_sent, sent, target = batch
|
||||||
|
seq_lens = [len(x) for x in sent]
|
||||||
|
loss, prediction, attention, fixations = network(sent, target, seq_lens)
|
||||||
|
|
||||||
|
prediction = prediction.detach().cpu().numpy()
|
||||||
|
|
||||||
|
torch.nn.utils.clip_grad_norm(network.parameters(), max_norm=5)
|
||||||
|
|
||||||
|
loss.backward()
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
total_train_loss += loss.item()
|
||||||
|
|
||||||
|
avg_train_loss = total_train_loss / len(train_pairs)
|
||||||
|
|
||||||
|
val_sents = []
|
||||||
|
val_preds = []
|
||||||
|
val_targets = []
|
||||||
|
|
||||||
|
network.eval()
|
||||||
|
for i, batch in tqdm.tqdm(
|
||||||
|
enumerate(val_batch_iter), total=len(val_pairs) // batch_size + 1
|
||||||
|
):
|
||||||
|
raw_sent, sent, target = batch
|
||||||
|
seq_lens = [len(x) for x in sent]
|
||||||
|
loss, prediction, attention, fixations = network(sent, target, seq_lens)
|
||||||
|
|
||||||
|
prediction = prediction.detach().cpu().numpy()
|
||||||
|
|
||||||
|
for i, l in enumerate(seq_lens):
|
||||||
|
val_sents.append(raw_sent[i][:l])
|
||||||
|
val_preds.append(prediction[i][:l].tolist())
|
||||||
|
val_targets.append(target[i][:l].tolist())
|
||||||
|
|
||||||
|
total_val_loss += loss.item()
|
||||||
|
|
||||||
|
avg_val_loss = total_val_loss / len(val_pairs)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"epoch {epoch} train_loss {avg_train_loss:.4f} val_loss {avg_val_loss:.4f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(
|
||||||
|
classification_report(
|
||||||
|
[x for y in val_targets for x in y],
|
||||||
|
[x for y in val_preds for x in y],
|
||||||
|
target_names=["not_del", "del"],
|
||||||
|
digits=5,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
with open(f"models/{model_name}/val_original_{epoch}.txt", "w") as oh, open(
|
||||||
|
f"models/{model_name}/val_pred_{epoch}.txt", "w"
|
||||||
|
) as ph, open(f"models/{model_name}/val_gold_{epoch}.txt", "w") as gh:
|
||||||
|
for sent, preds, golds in zip(val_sents, val_preds, val_targets):
|
||||||
|
pred_compressed = [
|
||||||
|
word for word, delete in zip(sent, preds) if not delete
|
||||||
|
]
|
||||||
|
gold_compressed = [
|
||||||
|
word for word, delete in zip(sent, golds) if not delete
|
||||||
|
]
|
||||||
|
|
||||||
|
oh.write(" ".join(sent))
|
||||||
|
ph.write(" ".join(pred_compressed))
|
||||||
|
gh.write(" ".join(gold_compressed))
|
||||||
|
|
||||||
|
oh.write("\n")
|
||||||
|
ph.write("\n")
|
||||||
|
gh.write("\n")
|
||||||
|
|
||||||
|
if best_val_loss is None or avg_val_loss < best_val_loss:
|
||||||
|
delta = avg_val_loss - best_val_loss if best_val_loss is not None else 0.0
|
||||||
|
best_val_loss = avg_val_loss
|
||||||
|
print(
|
||||||
|
f"new best model epoch {epoch} val loss {avg_val_loss:.4f} ({delta:.4f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
utils.save_model(
|
||||||
|
network, word2index, f"models/{model_name}/{model_name}_{epoch}"
|
||||||
|
)
|
||||||
|
|
||||||
|
epoch += 1
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.option(
|
||||||
|
"-c",
|
||||||
|
"--corpus",
|
||||||
|
"corpus_name",
|
||||||
|
required=True,
|
||||||
|
type=click.Choice(sorted(["google",])),
|
||||||
|
)
|
||||||
|
@click.option("-w", "--model_weights", required=True)
|
||||||
|
@click.option("-p", "--prior", type=float, default=.5)
|
||||||
|
@click.option("-l", "--longest", is_flag=True)
|
||||||
|
@click.option("-s", "--shortest", is_flag=True)
|
||||||
|
@click.option("-d", "--detailed", is_flag=True)
|
||||||
|
def test(corpus_name, model_weights, prior, longest, shortest, detailed):
|
||||||
|
if longest and shortest:
|
||||||
|
print("longest and shortest are mutually exclusive", file=sys.stderr)
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
corpus, word2index, index2word = load_corpus(corpus_name, ["test"])
|
||||||
|
test_pairs = corpus["test"]
|
||||||
|
|
||||||
|
model_name = os.path.basename(os.path.dirname(model_weights))
|
||||||
|
epoch = re.search("_(\d+).tar", model_weights).group(1)
|
||||||
|
|
||||||
|
logger.info("loading model checkpoint")
|
||||||
|
checkpoint = torch.load(model_weights, map_location=config.DEV)
|
||||||
|
if "word2index" in checkpoint:
|
||||||
|
weights = checkpoint["weights"]
|
||||||
|
word2index = checkpoint["word2index"]
|
||||||
|
index2word = {i: w for w, i in word2index.items()}
|
||||||
|
else:
|
||||||
|
asdf
|
||||||
|
|
||||||
|
network = init_network(word2index, prior)
|
||||||
|
network.eval()
|
||||||
|
|
||||||
|
# remove the embedding layer before loading
|
||||||
|
# weights = {k: v for k, v in weights.items() if not "embedding" in k}
|
||||||
|
# actually load the parameters
|
||||||
|
network.load_state_dict(weights, strict=False)
|
||||||
|
|
||||||
|
total_test_loss = 0
|
||||||
|
|
||||||
|
batch_size = 20
|
||||||
|
|
||||||
|
test_batch_iter = utils.sent_iter(
|
||||||
|
sents=test_pairs, word2index=word2index, batch_size=batch_size
|
||||||
|
)
|
||||||
|
|
||||||
|
test_sents = []
|
||||||
|
test_preds = []
|
||||||
|
test_targets = []
|
||||||
|
|
||||||
|
for i, batch in tqdm.tqdm(
|
||||||
|
enumerate(test_batch_iter, 1), total=len(test_pairs) // batch_size + 1
|
||||||
|
):
|
||||||
|
raw_sent, sent, target = batch
|
||||||
|
seq_lens = [len(x) for x in sent]
|
||||||
|
loss, prediction, attention, fixations = network(sent, target, seq_lens)
|
||||||
|
|
||||||
|
prediction = prediction.detach().cpu().numpy()
|
||||||
|
|
||||||
|
for i, l in enumerate(
|
||||||
|
seq_lens
|
||||||
|
):
|
||||||
|
test_sents.append(raw_sent[i][:l])
|
||||||
|
test_preds.append(prediction[i][:l].tolist())
|
||||||
|
test_targets.append(target[i][:l].tolist())
|
||||||
|
|
||||||
|
total_test_loss += loss.item()
|
||||||
|
|
||||||
|
avg_test_loss = total_test_loss / len(test_pairs)
|
||||||
|
|
||||||
|
print(f"test_loss {avg_test_loss:.4f}")
|
||||||
|
|
||||||
|
if longest:
|
||||||
|
avg_len = sum(len(s) for s in test_sents)/len(test_sents)
|
||||||
|
test_sents = list(filter(lambda x: len(x) > avg_len, test_sents))
|
||||||
|
test_preds = list(filter(lambda x: len(x) > avg_len, test_preds))
|
||||||
|
test_targets = list(filter(lambda x: len(x) > avg_len, test_targets))
|
||||||
|
elif shortest:
|
||||||
|
avg_len = sum(len(s) for s in test_sents)/len(test_sents)
|
||||||
|
test_sents = list(filter(lambda x: len(x) <= avg_len, test_sents))
|
||||||
|
test_preds = list(filter(lambda x: len(x) <= avg_len, test_preds))
|
||||||
|
test_targets = list(filter(lambda x: len(x) <= avg_len, test_targets))
|
||||||
|
|
||||||
|
if detailed:
|
||||||
|
for test_sent, test_target, test_pred in zip(test_sents, test_targets, test_preds):
|
||||||
|
print(precision_recall_fscore_support(test_target, test_pred, average="weighted")[2], test_sent, test_target, test_pred)
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
classification_report(
|
||||||
|
[x for y in test_targets for x in y],
|
||||||
|
[x for y in test_preds for x in y],
|
||||||
|
target_names=["not_del", "del"],
|
||||||
|
digits=5,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
with open(f"models/{model_name}/test_original_{epoch}.txt", "w") as oh, open(
|
||||||
|
f"models/{model_name}/test_pred_{epoch}.txt", "w"
|
||||||
|
) as ph, open(f"models/{model_name}/test_gold_{epoch}.txt", "w") as gh:
|
||||||
|
for sent, preds, golds in zip(test_sents, test_preds, test_targets):
|
||||||
|
pred_compressed = [word for word, delete in zip(sent, preds) if not delete]
|
||||||
|
gold_compressed = [word for word, delete in zip(sent, golds) if not delete]
|
||||||
|
|
||||||
|
oh.write(" ".join(sent))
|
||||||
|
ph.write(" ".join(pred_compressed))
|
||||||
|
gh.write(" ".join(gold_compressed))
|
||||||
|
|
||||||
|
oh.write("\n")
|
||||||
|
ph.write("\n")
|
||||||
|
gh.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.option(
|
||||||
|
"-c",
|
||||||
|
"--corpus",
|
||||||
|
"corpus_name",
|
||||||
|
required=True,
|
||||||
|
type=click.Choice(sorted(["google",])),
|
||||||
|
)
|
||||||
|
@click.option("-w", "--model_weights", required=True)
|
||||||
|
@click.option("-p", "--prior", type=float, default=.5)
|
||||||
|
@click.option("-l", "--longest", is_flag=True)
|
||||||
|
@click.option("-s", "--shortest", is_flag=True)
|
||||||
|
def predict(corpus_name, model_weights, prior, longest, shortest):
|
||||||
|
if longest and shortest:
|
||||||
|
print("longest and shortest are mutually exclusive", file=sys.stderr)
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
corpus, word2index, index2word = load_corpus(corpus_name, ["test"])
|
||||||
|
test_pairs = corpus["test"]
|
||||||
|
|
||||||
|
model_name = os.path.basename(os.path.dirname(model_weights))
|
||||||
|
epoch = re.search("_(\d+).tar", model_weights).group(1)
|
||||||
|
|
||||||
|
logger.info("loading model checkpoint")
|
||||||
|
checkpoint = torch.load(model_weights, map_location=config.DEV)
|
||||||
|
if "word2index" in checkpoint:
|
||||||
|
weights = checkpoint["weights"]
|
||||||
|
word2index = checkpoint["word2index"]
|
||||||
|
index2word = {i: w for w, i in word2index.items()}
|
||||||
|
else:
|
||||||
|
asdf
|
||||||
|
|
||||||
|
network = init_network(word2index, prior)
|
||||||
|
network.eval()
|
||||||
|
|
||||||
|
# remove the embedding layer before loading
|
||||||
|
# weights = {k: v for k, v in weights.items() if not "embedding" in k}
|
||||||
|
# actually load the parameters
|
||||||
|
network.load_state_dict(weights, strict=False)
|
||||||
|
|
||||||
|
total_test_loss = 0
|
||||||
|
|
||||||
|
batch_size = 20
|
||||||
|
|
||||||
|
test_batch_iter = utils.sent_iter(
|
||||||
|
sents=test_pairs, word2index=word2index, batch_size=batch_size
|
||||||
|
)
|
||||||
|
|
||||||
|
test_sents = []
|
||||||
|
test_preds = []
|
||||||
|
test_attentions = []
|
||||||
|
test_fixations = []
|
||||||
|
|
||||||
|
for i, batch in tqdm.tqdm(
|
||||||
|
enumerate(test_batch_iter, 1), total=len(test_pairs) // batch_size + 1
|
||||||
|
):
|
||||||
|
raw_sent, sent, target = batch
|
||||||
|
seq_lens = [len(x) for x in sent]
|
||||||
|
loss, prediction, attention, fixations = network(sent, target, seq_lens)
|
||||||
|
|
||||||
|
prediction = prediction.detach().cpu().numpy()
|
||||||
|
attention = attention.detach().cpu().numpy()
|
||||||
|
if fixations is not None:
|
||||||
|
fixations = fixations.detach().cpu().numpy()
|
||||||
|
|
||||||
|
for i, l in enumerate(
|
||||||
|
seq_lens
|
||||||
|
):
|
||||||
|
test_sents.append(raw_sent[i][:l])
|
||||||
|
test_preds.append(prediction[i][:l].tolist())
|
||||||
|
test_attentions.append(attention[i][:l].tolist())
|
||||||
|
if fixations is not None:
|
||||||
|
test_fixations.append(fixations[i][:l].tolist())
|
||||||
|
else:
|
||||||
|
test_fixations.append([])
|
||||||
|
|
||||||
|
total_test_loss += loss.item()
|
||||||
|
|
||||||
|
avg_test_loss = total_test_loss / len(test_pairs)
|
||||||
|
|
||||||
|
if longest:
|
||||||
|
avg_len = sum(len(s) for s in test_sents)/len(test_sents)
|
||||||
|
test_sents = list(filter(lambda x: len(x) > avg_len, test_sents))
|
||||||
|
test_preds = list(filter(lambda x: len(x) > avg_len, test_preds))
|
||||||
|
test_attentions = list(filter(lambda x: len(x) > avg_len, test_attentions))
|
||||||
|
test_fixations = list(filter(lambda x: len(x) > avg_len, test_fixations))
|
||||||
|
elif shortest:
|
||||||
|
avg_len = sum(len(s) for s in test_sents)/len(test_sents)
|
||||||
|
test_sents = list(filter(lambda x: len(x) <= avg_len, test_sents))
|
||||||
|
test_preds = list(filter(lambda x: len(x) <= avg_len, test_preds))
|
||||||
|
test_attentions = list(filter(lambda x: len(x) <= avg_len, test_attentions))
|
||||||
|
test_fixations = list(filter(lambda x: len(x) <= avg_len, test_fixations))
|
||||||
|
|
||||||
|
print(f"sentence\tprediction\tattentions\tfixations")
|
||||||
|
for s, p, a, f in zip(test_sents, test_preds, test_attentions, test_fixations):
|
||||||
|
a = [x[:len(a)] for x in a]
|
||||||
|
print(f"{s}\t{p}\t{a}\t{f}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
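The script writes one tab-separated row per test sentence: the token list, the per-token predictions, the attention rows, and the fixation scores. This is the format the utility scripts below parse with ast.literal_eval. An illustrative row (values invented):

['the', 'cat', 'sat']\t[1, 0, 1]\t[[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.2, 0.6]]\t[0.3, 0.4, 0.3]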
23
joint_sentence_compression_model/requirements.txt
Normal file
@@ -0,0 +1,23 @@
click==7.1.2
cycler==0.10.0
dataclasses==0.6
future==0.18.2
joblib==0.17.0
kiwisolver==1.3.1
matplotlib==3.3.3
nltk==3.5
numpy==1.19.4
Pillow==8.0.1
portalocker==2.0.0
pyparsing==2.4.7
python-dateutil==2.8.1
regex==2020.11.13
sacrebleu==1.4.6
scikit-learn==0.23.2
scipy==1.5.4
six==1.15.0
sklearn==0.0
threadpoolctl==2.1.0
torch==1.7.0
tqdm==4.54.1
typing-extensions==3.7.4.3
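These pins can be installed into a fresh environment in the usual way:

pip install -r joint_sentence_compression_model/requirements.txt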
43
joint_sentence_compression_model/utils/check_stats.py
Normal file
@@ -0,0 +1,43 @@
import ast
import sys

import click


def reader(path):
    with open(path) as h:
        for line in h:
            line = line.strip()
            try:
                s, p, a, f = line.split("\t")
            except ValueError:
                print(f"skipping line: {line}", file=sys.stderr)
            else:
                try:
                    yield (
                        ast.literal_eval(s),
                        ast.literal_eval(p),
                        ast.literal_eval(a),
                        ast.literal_eval(f),
                    )
                except (ValueError, SyntaxError):
                    print(f"malformed line: {s}", file=sys.stderr)


def get_stats(seq):
    # print each sentence, its prediction, the lengths of all four fields,
    # and the width of every attention row
    for s, p, a, f in seq:
        print(s)
        print(p)
        print(len(s), len(p), len(a), len(f))
        for x in a:
            print(len(x))
        print()


@click.command()
@click.argument("path")
def main(path):
    get_stats(reader(path))


if __name__ == "__main__":
    main()
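A quick sanity check over a predictions file produced by the predict command above (file name hypothetical):

python joint_sentence_compression_model/utils/check_stats.py predictions.tsv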
34
joint_sentence_compression_model/utils/cr.py
Normal file
@@ -0,0 +1,34 @@
import click


def reader(path):
    with open(path) as h:
        for line in h:
            yield line.strip().split()


@click.command()
@click.argument("original")
@click.argument("compressed")
def main(original, compressed):
    # macro-averaged compression ratio: the mean over sentence pairs of
    # len(compressed) / len(original), measured in tokens
    ratio = 0
    total = 0
    for o, c in zip(reader(original), reader(compressed)):
        ratio += len(c) / len(o)
        total += 1
    print(f"cr: {ratio/total:.4f}")


if __name__ == "__main__":
    main()
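For instance (hypothetical files), an original line "the quick brown fox jumps" (5 tokens) compressed to "fox jumps" (2 tokens) contributes 0.4 to the average:

python joint_sentence_compression_model/utils/cr.py originals.txt compressed.txt
cr: 0.4000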
88
joint_sentence_compression_model/utils/kl.py
Normal file
@@ -0,0 +1,88 @@
import ast
from math import log2
import os
from statistics import mean, pstdev
import sys

import click
import numpy as np

from matplotlib import pyplot as plt


def attention_reader(path):
    with open(path) as h:
        for line in h:
            line = line.strip()
            try:
                s, p, a, f = line.split("\t")
            except ValueError:
                print(f"skipping line: {line}", file=sys.stderr)
            else:
                try:
                    yield [[x[0] for x in y] for y in ast.literal_eval(a)]
                except (ValueError, SyntaxError):
                    print(f"skipping malformed line: {s}", file=sys.stderr)


def _kl_divergence(p, q):
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / p.sum()
    q = q / q.sum()
    return sum(p[i] * log2(p[i] / q[i]) for i in range(len(p)))


def kl_divergence(ps, qs):
    kl = 0
    count = 0
    for p, q in zip(ps, qs):
        kl += _kl_divergence(p, q)
        count += 1
    return kl / count


def _js_divergence(p, q):
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    return 0.5 * _kl_divergence(p, m) + 0.5 * _kl_divergence(q, m)


def js_divergence(ps, qs):
    js = 0
    count = 0
    for p, q in zip(ps, qs):
        js += _js_divergence(p, q)
        count += 1
    return js / count


def get_kl_div(seq1, seq2):
    # NOTE: despite the name, this reports the symmetric JS divergence;
    # substitute kl_divergence below for the raw KL divergence
    return [js_divergence(x1, x2) for x1, x2 in zip(seq1, seq2)]


@click.command()
@click.argument("ref")
@click.argument("path", nargs=-1)
def main(ref, path):
    kls = []
    labels = []
    for p in path:
        labels.append(os.path.basename(p))
        kl = get_kl_div(attention_reader(ref), attention_reader(p))
        print(mean(kl))
        print(pstdev(kl))
        kls.append(kl)

    plt.boxplot(kls, labels=labels)
    plt.show()
    plt.clf()


if __name__ == "__main__":
    main()
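The helpers implement KL(P||Q) = sum_i p_i * log2(p_i / q_i) and JSD(P||Q) = 0.5 * KL(P||M) + 0.5 * KL(Q||M) with M = 0.5 * (P + Q). A minimal sanity check, assuming the helpers above are importable (the import path is hypothetical):

from utils.kl import _kl_divergence, _js_divergence

p, q = [0.5, 0.5], [0.9, 0.1]
print(_kl_divergence(p, q))  # ~0.737 bits; KL is asymmetric
print(_js_divergence(p, q))  # ~0.147 bits; JS is symmetric and bounded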
112
joint_sentence_compression_model/utils/kl_divergence.py
Normal file
@@ -0,0 +1,112 @@
'''
Calculates the divergence between specified columns in a file, or across
the columns of two different files.
'''

from math import log2
import pandas as pd
import ast
import collections.abc
from sklearn import metrics
import sys


def kl_divergence(p, q):
    return sum(p[i] * log2(p[i] / q[i]) for i in range(len(p)))


def flatten(x):
    if isinstance(x, collections.abc.Iterable):
        return [a for i in x for a in flatten(i)]
    else:
        return [x]


def get_data(file):
    names = ["sentence", "prediction", "attentions", "fixations"]
    df = pd.read_csv(file, sep='\t', names=names)
    df = df[2:]
    attentions = df.loc[:, "attentions"].tolist()
    fixations = df.loc[:, "fixations"].tolist()

    return attentions, fixations


def _average(divergence):
    # ignore pairs for which no score could be computed
    scores = [d for d in divergence if d is not None]
    return sum(scores) / len(scores)


def attention_attention(attentions1, attentions2):
    # NOTE: mutual_info_score computes mutual information, not a true KL
    # divergence; scores are comparable across runs but are not KL values
    divergence = []

    for att1, att2 in zip(attentions1, attentions2):
        lst_att1 = flatten(ast.literal_eval(att1))
        lst_att2 = flatten(ast.literal_eval(att2))

        try:
            divergence.append(metrics.mutual_info_score(lst_att1, lst_att2))
        except ValueError:
            divergence.append(None)

    return _average(divergence)


def fixation_fixation(fixation1, fixation2):
    divergence = []

    for fix1, fix2 in zip(fixation1, fixation2):
        lst_fixation1 = flatten(ast.literal_eval(fix1))
        lst_fixation2 = flatten(ast.literal_eval(fix2))

        try:
            divergence.append(metrics.mutual_info_score(lst_fixation1, lst_fixation2))
        except ValueError:
            divergence.append(None)

    return _average(divergence)


def attention_fixation(attentions, fixations):
    divergence = []

    for attention, fixation in zip(attentions, fixations):
        current_attention = ast.literal_eval(attention)
        current_fixation = ast.literal_eval(fixation)

        # average each token's attention row so both sequences are flat
        lst_attention = []
        for t in current_attention:
            attention_lst = flatten(t)
            lst_attention.append(sum(attention_lst) / len(attention_lst))

        lst_fixation = flatten(current_fixation)

        try:
            divergence.append(metrics.mutual_info_score(lst_attention, lst_fixation))
        except ValueError:
            divergence.append(None)

    return _average(divergence)


def divergent_calculations(file1, file2=None, val1=None):
    attentions, fixations = get_data(file1)

    if file2:
        attentions2, fixations2 = get_data(file2)
        if val1 == "attention":
            divergence = attention_attention(attentions, attentions2)
        else:
            divergence = fixation_fixation(fixations, fixations2)
    else:
        divergence = attention_fixation(attentions, fixations)

    print("divergence:", divergence)


divergent_calculations(sys.argv[1], sys.argv[2], sys.argv[3])
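As invoked above, the script expects three positional arguments: two prediction files and the column selector ("attention" compares attention columns, anything else compares fixation columns). A hypothetical run:

python joint_sentence_compression_model/utils/kl_divergence.py run_a.tsv run_b.tsv attention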
64
joint_sentence_compression_model/utils/plot_attention.py
Normal file
@@ -0,0 +1,64 @@
import ast
import os
import pathlib

import click
import matplotlib.pyplot as plt
plt.switch_backend("agg")
import matplotlib.ticker as ticker
import numpy as np
import tqdm


def plot_attention(input_sentence, output_words, attentions, path):
    # set up figure with colorbar
    attentions = np.array(attentions)[:, :len(input_sentence)]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions, cmap="bone")
    fig.colorbar(cax)

    # set up axes
    ax.set_xticklabels([""] + input_sentence + ["<__EOS__>"], rotation=90)
    ax.set_yticklabels([""] + output_words)

    # show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.savefig(f"{path}.pdf")
    plt.close()


def parse(p):
    with open(p) as h:
        for line in h:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            _sentence, _prediction, _attention, _fixations = line.split("\t")
            try:
                sentence = ast.literal_eval(_sentence)
                prediction = ast.literal_eval(_prediction)
                attention = ast.literal_eval(_attention)
            except (ValueError, SyntaxError):
                continue

            yield sentence, prediction, attention


@click.command()
@click.argument("path", nargs=-1, required=True)
def main(path):
    for p in tqdm.tqdm(path):
        out_dir = os.path.splitext(p)[0]
        if out_dir == p:
            # the input file had no extension; avoid clobbering it with a
            # directory of the same name
            out_dir = f"{out_dir}_"
        pathlib.Path(out_dir).mkdir(exist_ok=True)
        for i, spa in enumerate(parse(p)):
            plot_attention(*spa, path=os.path.join(out_dir, str(i)))


if __name__ == "__main__":
    main()
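Each input file yields a sibling directory of per-sentence heatmap PDFs, e.g. (hypothetical file name):

python joint_sentence_compression_model/utils/plot_attention.py predictions.tsv
# writes predictions/0.pdf, predictions/1.pdf, ...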
70
joint_sentence_compression_model/utils/text_attention.py
Executable file
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date: 2019-03-29 16:10:23
# @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com
# @Last Modified time: 2019-04-12 09:56:12


## convert the text/attention list to latex code, which further generates the text heatmap based on attention weights.
import numpy as np

latex_special_token = ["!@#$%^&*()"]


def generate(text_list, attention_list, latex_file, color='red', rescale_value=False):
    assert len(text_list) == len(attention_list)
    if rescale_value:
        attention_list = rescale(attention_list)
    word_num = len(text_list)
    text_list = clean_word(text_list)
    with open(latex_file, 'w') as f:
        f.write(r'''\documentclass[varwidth]{standalone}
\special{papersize=210mm,297mm}
\usepackage{color}
\usepackage{tcolorbox}
\usepackage{CJK}
\usepackage{adjustbox}
\tcbset{width=0.9\textwidth,boxrule=0pt,colback=red,arc=0pt,auto outer arc,left=0pt,right=0pt,boxsep=5pt}
\begin{document}
\begin{CJK*}{UTF8}{gbsn}''' + '\n')
        string = r'''{\setlength{\fboxsep}{0pt}\colorbox{white!0}{\parbox{0.9\textwidth}{''' + "\n"
        for idx in range(word_num):
            string += "\\colorbox{%s!%s}{" % (color, attention_list[idx]) + "\\strut " + text_list[idx] + "} "
        string += "\n}}}"
        f.write(string + '\n')
        f.write(r'''\end{CJK*}
\end{document}''')


def rescale(input_list):
    the_array = np.asarray(input_list)
    the_max = np.max(the_array)
    the_min = np.min(the_array)
    rescale = (the_array - the_min) / (the_max - the_min) * 100
    return rescale.tolist()


def clean_word(word_list):
    new_word_list = []
    for word in word_list:
        for latex_sensitive in ["\\", "%", "&", "^", "#", "_", "{", "}"]:
            if latex_sensitive in word:
                word = word.replace(latex_sensitive, '\\' + latex_sensitive)
        new_word_list.append(word)
    return new_word_list


if __name__ == '__main__':
    ## This is a demo:

    sent = '''the USS Ronald Reagan - an aircraft carrier docked in Japan - during his tour of the region, vowing to "defeat any attack and meet any use of conventional or nuclear weapons with an overwhelming and effective American response".
North Korea and the US have ratcheted up tensions in recent weeks and the movement of the strike group had raised the question of a pre-emptive strike by the US.
On Wednesday, Mr Pence described the country as the "most dangerous and urgent threat to peace and security" in the Asia-Pacific.'''
    # the Chinese sample below overrides the English one above
    sent = '''我 回忆 起 我 曾经 在 大学 年代 , 我们 经常 喜欢 玩 “ Hawaii guitar ” 。 说起 Guitar , 我 想起 了 西游记 里 的 琵琶精 。
今年 下半年 , 中 美 合拍 的 西游记 即将 正式 开机 , 我 继续 扮演 美猴王 孙悟空 , 我 会 用 美猴王 艺术 形象 努力 创造 一 个 正能量 的 形象 , 文 体 两 开花 , 弘扬 中华 文化 , 希望 大家 能 多多 关注 。'''
    words = sent.split()
    word_num = len(words)
    attention = [(x + 1.) / word_num * 100 for x in range(word_num)]
    import random
    random.seed(42)
    random.shuffle(attention)
    color = 'red'
    generate(words, attention, "sample.tex", color)
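The demo writes a standalone LaTeX document; compiling it (assuming pdflatex and the CJK package are available) renders the heatmap:

pdflatex sample.tex   # produces sample.pdf with one colour box per token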