# human-gaze-guided-neural-at.../joint_paraphrase_model/libs/corpora.py

import logging

import config


def tokenize(sent):
    """Split ``sent`` on single space characters and return the token list.

    Consecutive spaces produce empty-string tokens, because the separator
    is passed explicitly to ``split``.
    """
    separator = " "
    tokens = sent.split(separator)
    return tokens


class Lang:
    """Vocabulary that maps words to integer indices and back.

    Indices 0-4 are reserved for the special tokens ``config.PAD``,
    ``config.UNK``, ``config.NOFIX``, ``config.SOS`` and ``config.EOS``.
    Every other word receives the next free index the first time it is added.
    """

    def __init__(self, name):
        """Create a vocabulary called ``name`` holding only the special tokens."""
        self.name = name
        # word -> index, pre-seeded with the reserved special tokens
        self.word2index = {
            config.PAD: 0,
            config.UNK: 1,
            config.NOFIX: 2,
            config.SOS: 3,
            config.EOS: 4,
        }
        # word -> number of times the word has been added
        self.word2count = {}
        # index -> word, the inverse of word2index
        self.index2word = {
            0: config.PAD,
            1: config.UNK,
            2: config.NOFIX,
            3: config.SOS,
            4: config.EOS,
        }
        # next free index; also the current vocabulary size
        self.n_words = 5

    def add_sentence(self, sentence):
        """Add every token of an already tokenized sentence.

        Args:
            sentence: A list or tuple of word strings.

        Raises:
            AssertionError: If ``sentence`` is not a list or tuple (for
                example a raw, untokenized string).
        """
        # NOTE: kept as an assert so callers see the same exception type as
        # before; be aware that asserts are skipped when running with -O.
        assert isinstance(
            sentence, (list, tuple)
        ), "input to add_sentence must be tokenized"
        for word in sentence:
            self.add_word(word)

    def add_word(self, word):
        """Register one occurrence of ``word``.

        Unknown words are assigned the next free index. Known words only
        have their count increased.
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        # The special tokens live in word2index but start without an entry in
        # word2count, so a plain ``+= 1`` raised KeyError whenever one of them
        # showed up in the data. ``get`` handles new, known and special words.
        self.word2count[word] = self.word2count.get(word, 0) + 1

    def __add__(self, other):
        """Returns a new Lang object containing the vocabulary from this and
        the other Lang object

        Words from ``self`` are indexed first, followed by the words that
        only occur in ``other``. Counts are the sum of both operands' counts.
        """
        new_lang = Lang(f"{self.name}_{other.name}")

        # Add vocabulary from both Langs
        for word in self.word2count:
            new_lang.add_word(word)
        for word in other.word2count:
            new_lang.add_word(word)

        # add_word only counted how many operands contained each word;
        # overwrite with the real summed frequencies.
        for word in new_lang.word2count:
            new_lang.word2count[word] = self.word2count.get(
                word, 0
            ) + other.word2count.get(word, 0)

        return new_lang


def load_wiki(split):
    """Load the Wiki paraphrase pairs from PAWs.

    Reads the tab-separated file configured for ``split``, skips its header
    line and keeps only the rows whose rating column is not ``"0"``.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every kept pair.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly four tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_wiki")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.wiki_train_path
    elif split == "val":
        path = config.wiki_dev_path
    elif split == "test":
        path = config.wiki_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    # MS makes the vocab for paraphrase the same: one Lang for both sides
    lang = Lang("wiki")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            # rows rated "0" are dropped
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_qqp_paws(split):
    """Load the QQP paraphrase pairs from PAWs.

    Reads the tab-separated file configured for ``split``, skips its header
    line and keeps only the rows whose rating column is not ``"0"``.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every kept pair.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly four tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp_paws")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.qqp_paws_train_path
    elif split == "val":
        path = config.qqp_paws_dev_path
    elif split == "test":
        path = config.qqp_paws_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    # MS makes the vocab for paraphrase the same: one Lang for both sides
    lang = Lang("qqp_paws")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, sent1, sent2, rating = line.strip().split("\t")
            # rows rated "0" are dropped
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang

def load_qqp(split):
    """Load the QQP paraphrase pairs from the original release.

    Reads the tab-separated file configured for ``split``, skips its header
    line and keeps only the rows whose rating (first column) is not ``"0"``.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every kept pair.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly four tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.qqp_train_path
    elif split == "val":
        path = config.qqp_dev_path
    elif split == "test":
        path = config.qqp_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    # MS makes the vocab for paraphrase the same: one Lang for both sides
    lang = Lang("qqp")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip header
        handle.readline()

        for line in handle:
            # here the rating is the first column, the last one is unused
            rating, sent1, sent2, _ = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_qqp_kag(split):
    """Load the QQP paraphrase pairs from the Kaggle release.

    Not the original data right now: experimenting with a Kaggle
    100K / 3K / 30K split. Reads the tab-separated file configured for
    ``split``, skips its header line and keeps only the rows whose rating
    (first column) is not ``"0"``.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every kept pair.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly three tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_qqp_kag")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.qqp_kag_train_path
    elif split == "val":
        path = config.qqp_kag_dev_path
    elif split == "test":
        path = config.qqp_kag_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    # MS makes the vocab for paraphrase the same: one Lang for both sides
    lang = Lang("qqp_kag")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip header
        handle.readline()

        for line in handle:
            # the Kaggle version has three fields per row rather than four
            rating, sent1, sent2 = line.strip().split("\t")
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_msrpc(split):
    """Load the Microsoft Research Paraphrase Corpus (MSRPC).

    Reads the tab-separated file configured for ``split``, skips its header
    line and keeps only the rows whose rating (first column) is not ``"0"``.
    The second and third columns are ignored.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every kept pair.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly five tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_msrpc")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.msrpc_train_path
    elif split == "val":
        path = config.msrpc_dev_path
    elif split == "test":
        path = config.msrpc_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    # MS makes the vocab for paraphrase the same: one Lang for both sides
    lang = Lang("msrpc")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip header
        handle.readline()

        for line in handle:
            rating, _, _, sent1, sent2 = line.strip().split("\t")
            # rows rated "0" are dropped
            if rating == "0":
                continue
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang

def load_sentiment(split):
    """Load the Sentiment Kaggle Comp Dataset.

    Reads the tab-separated file configured for ``split`` and skips its
    header line. The first two columns are ignored; the third and fourth
    are tokenized and returned as a pair. No rows are filtered out.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both columns of every row.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly four tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_sentiment")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.sentiment_train_path
    elif split == "val":
        path = config.sentiment_dev_path
    elif split == "test":
        path = config.sentiment_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    lang = Lang("sentiment")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip header
        handle.readline()

        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang


def load_tamil(split):
    """Load the En to Tamil dataset, current SOTA ~13 bleu.

    Reads the tab-separated file configured for ``split`` and skips its
    first line. Each remaining row holds two sentences; both are split on
    spaces and added to one shared vocabulary.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every row.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly two tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_tamil")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.tamil_train_path
    elif split == "val":
        path = config.tamil_dev_path
    elif split == "test":
        path = config.tamil_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    lang = Lang("tamil")
    pairs = []
    # The Tamil text is non-ASCII, so decoding must not depend on the
    # platform locale (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip the first line
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            # NOTE: the second sentence is split on spaces like the first;
            # how to tokenize Tamil properly is still an open question.
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang

def load_compression(split):
    """Load the Google Sentence Compression Dataset.

    Reads the tab-separated file configured for ``split`` and skips its
    first line. Each remaining row holds two sentences, which are tokenized
    and returned as a pair.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both sentences of every row.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly two tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_compression")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.compression_train_path
    elif split == "val":
        path = config.compression_dev_path
    elif split == "test":
        path = config.compression_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    lang = Lang("compression")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # skip the first line
        handle.readline()

        for line in handle:
            sent1, sent2 = line.strip().split("\t")
            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang

def load_stanford(split):
    """Load the Stanford Sentiment Dataset phrases.

    Reads the tab-separated file configured for ``split``. Unlike most of
    the other loaders, the first line is NOT skipped: every line is parsed
    as data. The first two columns are ignored; the third and fourth are
    tokenized and returned as a pair.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both columns of every row.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly four tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_stanford")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.stanford_train_path
    elif split == "val":
        path = config.stanford_dev_path
    elif split == "test":
        path = config.stanford_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    lang = Lang("stanford")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # header skipping is intentionally disabled for this file
        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang

def load_stanford_sent(split):
    """Load the Stanford Sentiment Dataset sentences.

    Reads the tab-separated file configured for ``split``. Unlike most of
    the other loaders, the first line is NOT skipped: every line is parsed
    as data. The first two columns are ignored; the third and fourth are
    tokenized and returned as a pair.

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``.

    Returns:
        A ``(pairs, lang)`` tuple. ``pairs`` is a list of
        ``[sent1_tokens, sent2_tokens]`` lists; ``lang`` is a ``Lang``
        vocabulary built from both columns of every row.

    Raises:
        ValueError: If ``split`` is not a supported name, or if a row does
            not contain exactly four tab-separated fields.
    """
    logger = logging.getLogger(f"{__name__}.load_stanford_sent")

    # Validate the split first: previously an unknown name left ``path``
    # unbound and surfaced as an UnboundLocalError at the log call below.
    if split == "train":
        path = config.stanford_sent_train_path
    elif split == "val":
        path = config.stanford_sent_dev_path
    elif split == "test":
        path = config.stanford_sent_test_path
    else:
        raise ValueError(
            f"unknown split {split!r}; expected 'train', 'val' or 'test'"
        )

    logger.info("loading %s from %s", split, path)

    lang = Lang("stanford_sent")
    pairs = []
    # Explicit encoding so parsing does not depend on the platform locale
    # (assumes the corpus file is UTF-8).
    with open(path, encoding="utf-8") as handle:

        # header skipping is intentionally disabled for this file
        for line in handle:
            _, _, sent1, sent2 = line.strip().split("\t")

            sent1 = tokenize(sent1)
            sent2 = tokenize(sent2)
            lang.add_sentence(sent1)
            lang.add_sentence(sent2)

            pairs.append([sent1, sent2])

    return pairs, lang