import os
import pickle
import random
import sys

import numpy as np
import torch

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from utils.data_utils import encode_input, encode_input_with_mask, encode_image_input
from dataloader.dataloader_base import DatasetBase


class VisdialDataset(DatasetBase):

    def __init__(self, config):
        super(VisdialDataset, self).__init__(config)

    def __getitem__(self, index):
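        """Build one item for the current split.

        Depending on ``self._split`` the returned dict contains:

        * ``train``: one positive and one negative dialog sequence
          (next-sentence labels 0/1), with masked-LM inputs for the
          positive sequence.
        * ``val``: all 10 rounds x 100 answer options, the ground-truth
          option indices, and dense relevance annotations if available.
        * ``test``: the 100 answer options of the last round only.

        Every item also carries the question/history/image graph data
        (edge indices, edge attributes, and the [SEP] positions used as
        history graph node features) plus the image region features.
        """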
        MAX_SEQ_LEN = self.config['max_seq_len']
        cur_data = None
        if self._split == 'train':
            cur_data = self.visdial_data_train['data']
            ques_adj_matrices_dir = os.path.join(self.config['visdial_question_adj_matrices'], 'train')
            hist_adj_matrices_dir = os.path.join(self.config['visdial_history_adj_matrices'], 'train')
        elif self._split == 'val':
            cur_data = self.visdial_data_val['data']
            ques_adj_matrices_dir = os.path.join(self.config['visdial_question_adj_matrices'], 'val')
            hist_adj_matrices_dir = os.path.join(self.config['visdial_history_adj_matrices'], 'val')
        else:
            cur_data = self.visdial_data_test['data']
            ques_adj_matrices_dir = os.path.join(self.config['visdial_question_adj_matrices'], 'test')
            hist_adj_matrices_dir = os.path.join(self.config['visdial_history_adj_matrices'], 'test')

        # VisDial v0.9 only ships train-split adjacency matrices
        if self.config['visdial_version'] == 0.9:
            ques_adj_matrices_dir = os.path.join(self.config['visdial_question_adj_matrices'], 'train')
            hist_adj_matrices_dir = os.path.join(self.config['visdial_history_adj_matrices'], 'train')

        self.num_bad_samples = 0
        # number of options to score on
        num_options = self.num_options
        assert num_options > 1 and num_options <= 100

        num_dialog_rounds = 10

        dialog = cur_data['dialogs'][index]
        cur_questions = cur_data['questions']
        cur_answers = cur_data['answers']
        img_id = dialog['image_id']
        graph_idx = dialog.get('dialog_idx', index)

        if self._split == 'train':
            # caption
            sent = dialog['caption'].split(' ')
            sentences = ['[CLS]']
            tot_len = 1  # for the [CLS] token
            sentence_map = [0]  # for the [CLS] token
            sentence_count = 0
            speakers = [0]

            tokenized_sent, sentences, tot_len, sentence_count, sentence_map, speakers = \
                self.tokenize_utterance(sent, sentences, tot_len, sentence_count, sentence_map, speakers)

            utterances = [[tokenized_sent]]
            utterances_random = [[tokenized_sent]]

            for rnd, utterance in enumerate(dialog['dialog']):
                cur_rnd_utterance = utterances[-1].copy()
                cur_rnd_utterance_random = utterances[-1].copy()

                # question
                sent = cur_questions[utterance['question']].split(' ')
                tokenized_sent, sentences, tot_len, sentence_count, sentence_map, speakers = \
                    self.tokenize_utterance(sent, sentences, tot_len, sentence_count, sentence_map, speakers)
                cur_rnd_utterance.append(tokenized_sent)
                cur_rnd_utterance_random.append(tokenized_sent)

                # answer
                sent = cur_answers[utterance['answer']].split(' ')
                tokenized_sent, sentences, tot_len, sentence_count, sentence_map, speakers = \
                    self.tokenize_utterance(sent, sentences, tot_len, sentence_count, sentence_map, speakers)
                cur_rnd_utterance.append(tokenized_sent)
                utterances.append(cur_rnd_utterance)

                # randomly sample a non-ground-truth answer for this round
                num_inds = len(utterance['answer_options'])
                gt_option_ind = utterance['gt_index']

                negative_samples = []
                for _ in range(self.config['num_negative_samples']):
                    all_inds = list(range(100))
                    all_inds.remove(gt_option_ind)
                    all_inds = all_inds[:(num_options - 1)]
                    tokenized_random_utterance = None
                    option_ind = None

                    while len(all_inds):
                        option_ind = random.choice(all_inds)
                        tokenized_random_utterance = self.tokenizer.convert_tokens_to_ids(
                            cur_answers[utterance['answer_options'][option_ind]].split(' '))
                        # the +1 accounts for the [SEP] token at the end of each utterance
                        if MAX_SEQ_LEN >= (tot_len + len(tokenized_random_utterance) + 1):
                            break
                        else:
                            all_inds.remove(option_ind)
                    if len(all_inds) == 0:
                        # all the options exceed the max length; truncate the last utterance in this case
                        tokenized_random_utterance = tokenized_random_utterance[:len(tokenized_sent)]
                    t = cur_rnd_utterance_random.copy()
                    t.append(tokenized_random_utterance)
                    negative_samples.append(t)

                utterances_random.append(negative_samples)

            # remove the caption from the beginning
            utterances = utterances[1:]
            utterances_random = utterances_random[1:]
            assert len(utterances) == len(utterances_random) == num_dialog_rounds
            assert tot_len <= MAX_SEQ_LEN, '{} {} tot_len = {} > max_seq_len'.format(
                self._split, index, tot_len)

            tokens_all = []
            question_limits_all = []
            question_edge_indices_all = []
            question_edge_attributes_all = []
            history_edge_indices_all = []
            history_sep_indices_all = []
            mask_all = []
            segments_all = []
            sep_indices_all = []
            next_labels_all = []
            hist_len_all = []

            # randomly pick the rounds to train on
            pos_rounds = sorted(random.sample(range(num_dialog_rounds), self.config['sequences_per_image'] // 2), reverse=True)
            neg_rounds = sorted(random.sample(range(num_dialog_rounds), self.config['sequences_per_image'] // 2), reverse=True)

            tokens_all_rnd = []
            question_limits_all_rnd = []
            mask_all_rnd = []
            segments_all_rnd = []
            sep_indices_all_rnd = []
            next_labels_all_rnd = []
            hist_len_all_rnd = []

            for j in pos_rounds:
                context = utterances[j]
                context, start_segment = self.pruneRounds(context, self.config['visdial_tot_rounds'])
                if j == pos_rounds[0]:  # dialog with positive label and max rounds
                    tokens, segments, sep_indices, mask, input_mask, start_question, end_question = encode_input_with_mask(
                        context, start_segment, self.CLS, self.SEP, self.MASK,
                        max_seq_len=MAX_SEQ_LEN, mask_prob=self.config['mask_prob'])
                else:
                    tokens, segments, sep_indices, mask, start_question, end_question = encode_input(
                        context, start_segment, self.CLS, self.SEP, self.MASK,
                        max_seq_len=MAX_SEQ_LEN, mask_prob=self.config['mask_prob'])
                tokens_all_rnd.append(tokens)
                question_limits_all_rnd.append(torch.tensor([start_question, end_question]))
                mask_all_rnd.append(mask)
                sep_indices_all_rnd.append(sep_indices)
                next_labels_all_rnd.append(torch.LongTensor([0]))
                segments_all_rnd.append(segments)
                hist_len_all_rnd.append(torch.LongTensor([len(context) - 1]))

            tokens_all.append(torch.cat(tokens_all_rnd, 0).unsqueeze(0))
            mask_all.append(torch.cat(mask_all_rnd, 0).unsqueeze(0))
            question_limits_all.extend(question_limits_all_rnd)
            segments_all.append(torch.cat(segments_all_rnd, 0).unsqueeze(0))
            sep_indices_all.append(torch.cat(sep_indices_all_rnd, 0).unsqueeze(0))
            next_labels_all.append(torch.cat(next_labels_all_rnd, 0).unsqueeze(0))
            hist_len_all.append(torch.cat(hist_len_all_rnd, 0).unsqueeze(0))

            assert len(pos_rounds) == 1
            question_graphs = pickle.load(
                open(os.path.join(ques_adj_matrices_dir, f'{graph_idx}.pkl'), 'rb'))
            question_graph_pos = question_graphs[pos_rounds[0]]
            question_edge_index_pos = []
            question_edge_attribute_pos = []
            for edge_idx, edge_attr in question_graph_pos:
                question_edge_index_pos.append(edge_idx)
                # one-hot edge attribute; the extra last slot encodes labels missing from parse_vocab
                edge_attr_one_hot = np.zeros((len(self.parse_vocab) + 1,), dtype=np.float32)
                edge_attr_one_hot[self.parse_vocab.get(edge_attr, len(self.parse_vocab))] = 1.0
                question_edge_attribute_pos.append(edge_attr_one_hot)
            question_edge_index_pos = np.array(question_edge_index_pos, dtype=np.float64)
            question_edge_attribute_pos = np.stack(question_edge_attribute_pos, axis=0)
            question_edge_indices_all.append(
                torch.from_numpy(question_edge_index_pos).t().long().contiguous())
            question_edge_attributes_all.append(
                torch.from_numpy(question_edge_attribute_pos))

            history_edge_indices = pickle.load(
                open(os.path.join(hist_adj_matrices_dir, f'{graph_idx}.pkl'), 'rb'))
            history_edge_indices_all.append(
                torch.tensor(history_edge_indices[pos_rounds[0]]).t().long().contiguous())

            # Get the [SEP] tokens that will represent the history graph node features
            hist_idx_pos = [i * 2 for i in range(pos_rounds[0] + 1)]
            sep_indices = sep_indices.squeeze(0).numpy()
            history_sep_indices_all.append(torch.from_numpy(sep_indices[hist_idx_pos]))

            if len(neg_rounds) > 0:
                tokens_all_rnd = []
                question_limits_all_rnd = []
                mask_all_rnd = []
                segments_all_rnd = []
                sep_indices_all_rnd = []
                next_labels_all_rnd = []
                hist_len_all_rnd = []

                for j in neg_rounds:
                    negative_samples = utterances_random[j]
                    for context_random in negative_samples:
                        context_random, start_segment = self.pruneRounds(context_random, self.config['visdial_tot_rounds'])
                        tokens_random, segments_random, sep_indices_random, mask_random, start_question, end_question = encode_input(
                            context_random, start_segment, self.CLS, self.SEP, self.MASK,
                            max_seq_len=MAX_SEQ_LEN, mask_prob=self.config['mask_prob'])
                        tokens_all_rnd.append(tokens_random)
                        question_limits_all_rnd.append(torch.tensor([start_question, end_question]))
                        mask_all_rnd.append(mask_random)
                        sep_indices_all_rnd.append(sep_indices_random)
                        next_labels_all_rnd.append(torch.LongTensor([1]))
                        segments_all_rnd.append(segments_random)
                        hist_len_all_rnd.append(torch.LongTensor([len(context_random) - 1]))

                tokens_all.append(torch.cat(tokens_all_rnd, 0).unsqueeze(0))
                mask_all.append(torch.cat(mask_all_rnd, 0).unsqueeze(0))
                question_limits_all.extend(question_limits_all_rnd)
                segments_all.append(torch.cat(segments_all_rnd, 0).unsqueeze(0))
                sep_indices_all.append(torch.cat(sep_indices_all_rnd, 0).unsqueeze(0))
                next_labels_all.append(torch.cat(next_labels_all_rnd, 0).unsqueeze(0))
                hist_len_all.append(torch.cat(hist_len_all_rnd, 0).unsqueeze(0))

            assert len(neg_rounds) == 1
            question_graph_neg = question_graphs[neg_rounds[0]]
            question_edge_index_neg = []
            question_edge_attribute_neg = []
            for edge_idx, edge_attr in question_graph_neg:
                question_edge_index_neg.append(edge_idx)
                edge_attr_one_hot = np.zeros((len(self.parse_vocab) + 1,), dtype=np.float32)
                edge_attr_one_hot[self.parse_vocab.get(edge_attr, len(self.parse_vocab))] = 1.0
                question_edge_attribute_neg.append(edge_attr_one_hot)
            question_edge_index_neg = np.array(question_edge_index_neg, dtype=np.float64)
            question_edge_attribute_neg = np.stack(question_edge_attribute_neg, axis=0)
            question_edge_indices_all.append(
                torch.from_numpy(question_edge_index_neg).t().long().contiguous())
            question_edge_attributes_all.append(
                torch.from_numpy(question_edge_attribute_neg))

            history_edge_indices_all.append(
                torch.tensor(history_edge_indices[neg_rounds[0]]).t().long().contiguous())
            # Get the [SEP] tokens that will represent the history graph node features
            hist_idx_neg = [i * 2 for i in range(neg_rounds[0] + 1)]
            sep_indices_random = sep_indices_random.squeeze(0).numpy()
            history_sep_indices_all.append(torch.from_numpy(sep_indices_random[hist_idx_neg]))

            tokens_all = torch.cat(tokens_all, 0)  # [2, num_pos, max_len]
            question_limits_all = torch.stack(question_limits_all, 0)  # [2, 2]
            mask_all = torch.cat(mask_all, 0)
            segments_all = torch.cat(segments_all, 0)
            sep_indices_all = torch.cat(sep_indices_all, 0)
            next_labels_all = torch.cat(next_labels_all, 0)
            hist_len_all = torch.cat(hist_len_all, 0)
            input_mask_all = \
                torch.LongTensor(input_mask)  # [max_len]

            item = {}
            item['tokens'] = tokens_all
            item['question_limits'] = question_limits_all
            item['question_edge_indices'] = question_edge_indices_all
            item['question_edge_attributes'] = question_edge_attributes_all
            item['history_edge_indices'] = history_edge_indices_all
            item['history_sep_indices'] = history_sep_indices_all
            item['segments'] = segments_all
            item['sep_indices'] = sep_indices_all
            item['mask'] = mask_all
            item['next_sentence_labels'] = next_labels_all
            item['hist_len'] = hist_len_all
            item['input_mask'] = input_mask_all

            # get image features
            if not self.config['dataloader_text_only']:
                features, num_boxes, boxes, _, image_target, image_edge_indexes, image_edge_attributes = \
                    self._image_features_reader[img_id]
                features, spatials, image_mask, image_target, image_label = encode_image_input(
                    features, num_boxes, boxes, image_target, max_regions=self._max_region_num)
            else:
                features = spatials = image_mask = image_target = image_label = torch.tensor([0])

        elif self._split == 'val':
            gt_relevance = None
            gt_option_inds = []
            options_all = []

            # caption
            sent = dialog['caption'].split(' ')
            sentences = ['[CLS]']
            tot_len = 1  # for the [CLS] token
            sentence_map = [0]  # for the [CLS] token
            sentence_count = 0
            speakers = [0]

            tokenized_sent, sentences, tot_len, sentence_count, sentence_map, speakers = \
                self.tokenize_utterance(sent, sentences, tot_len, sentence_count, sentence_map, speakers)
            utterances = [[tokenized_sent]]

            for rnd, utterance in enumerate(dialog['dialog']):
                cur_rnd_utterance = utterances[-1].copy()

                # question
                sent = cur_questions[utterance['question']].split(' ')
                tokenized_sent, sentences, tot_len, sentence_count, sentence_map, speakers = \
                    self.tokenize_utterance(sent, sentences, tot_len, sentence_count, sentence_map, speakers)
                cur_rnd_utterance.append(tokenized_sent)

                # current round: first select the gt option id, then add the first
                # (num_options - 1) remaining inds
                gt_option_ind = utterance['gt_index']
                option_inds = []
                option_inds.append(gt_option_ind)
                all_inds = list(range(100))
                all_inds.remove(gt_option_ind)
                all_inds = all_inds[:(num_options - 1)]
                option_inds.extend(all_inds)
                gt_option_inds.append(0)
                cur_rnd_options = []
                answer_options = [utterance['answer_options'][k] for k in option_inds]
                assert len(answer_options) == len(option_inds) == num_options
                assert answer_options[0] == utterance['answer']

                # for evaluation of all options and dense relevance
                if self.visdial_data_val_dense:
                    if rnd == self.visdial_data_val_dense[index]['round_id'] - 1:
                        # only 1 round has gt_relevance for each example
                        if 'relevance' in self.visdial_data_val_dense[index]:
                            gt_relevance = torch.Tensor(self.visdial_data_val_dense[index]['relevance'])
                        else:
                            gt_relevance = torch.Tensor(self.visdial_data_val_dense[index]['gt_relevance'])
                        # shuffle based on the new option order
                        gt_relevance = gt_relevance[torch.LongTensor(option_inds)]
                else:
                    gt_relevance = -1

                for answer_option in answer_options:
                    cur_rnd_cur_option = cur_rnd_utterance.copy()
                    cur_rnd_cur_option.append(self.tokenizer.convert_tokens_to_ids(cur_answers[answer_option].split(' ')))
                    cur_rnd_options.append(cur_rnd_cur_option)

                # answer
                sent = cur_answers[utterance['answer']].split(' ')
                tokenized_sent, sentences, tot_len, sentence_count, sentence_map, speakers = \
                    self.tokenize_utterance(sent, sentences, tot_len, sentence_count, sentence_map, speakers)
                cur_rnd_utterance.append(tokenized_sent)

                utterances.append(cur_rnd_utterance)
                options_all.append(cur_rnd_options)

            # encode the input and create batch x 10 x 100 x max_len arrays (batch x num_rounds x num_options)
            tokens_all = []
            question_limits_all = []
            mask_all = []
            segments_all = []
            sep_indices_all = []
            hist_len_all = []
            history_sep_indices_all = []

            for rnd, cur_rnd_options in enumerate(options_all):
                tokens_all_rnd = []
                mask_all_rnd = []
                segments_all_rnd = []
                sep_indices_all_rnd = []
                hist_len_all_rnd = []

                for j, cur_rnd_option in enumerate(cur_rnd_options):
                    cur_rnd_option, start_segment = self.pruneRounds(cur_rnd_option, self.config['visdial_tot_rounds'])
                    if rnd == len(options_all) - 1 and j == 0:  # gt dialog
                        tokens, segments, sep_indices, mask, input_mask, start_question, end_question = encode_input_with_mask(
                            cur_rnd_option, start_segment, self.CLS, self.SEP, self.MASK,
                            max_seq_len=MAX_SEQ_LEN, mask_prob=0)
                    else:
                        tokens, segments, sep_indices, mask, start_question, end_question = encode_input(
                            cur_rnd_option, start_segment, self.CLS, self.SEP, self.MASK,
                            max_seq_len=MAX_SEQ_LEN, mask_prob=0)

                    tokens_all_rnd.append(tokens)
                    mask_all_rnd.append(mask)
                    segments_all_rnd.append(segments)
                    sep_indices_all_rnd.append(sep_indices)
                    hist_len_all_rnd.append(torch.LongTensor([len(cur_rnd_option) - 1]))

                question_limits_all.append(torch.tensor([start_question, end_question]).unsqueeze(0).repeat(100, 1))
                tokens_all.append(torch.cat(tokens_all_rnd, 0).unsqueeze(0))
                mask_all.append(torch.cat(mask_all_rnd, 0).unsqueeze(0))
                segments_all.append(torch.cat(segments_all_rnd, 0).unsqueeze(0))
                sep_indices_all.append(torch.cat(sep_indices_all_rnd, 0).unsqueeze(0))
                hist_len_all.append(torch.cat(hist_len_all_rnd, 0).unsqueeze(0))
                # Get the [SEP] tokens that will represent the history graph node
                # features. They are the same for every answer candidate, as the
                # history does not change with the answer.
                hist_idx = [i * 2 for i in range(rnd + 1)]
                history_sep_indices_all.extend(sep_indices.squeeze(0)[hist_idx].contiguous() for _ in range(100))

            tokens_all = torch.cat(tokens_all, 0)  # [10, 100, max_len]
            mask_all = torch.cat(mask_all, 0)
            segments_all = torch.cat(segments_all, 0)
            sep_indices_all = torch.cat(sep_indices_all, 0)
            hist_len_all = torch.cat(hist_len_all, 0)
            input_mask_all = torch.LongTensor(input_mask)  # [max_len]

            # load graph data
            question_limits_all = torch.stack(question_limits_all, 0)  # [10, 100, 2]
            question_graphs = pickle.load(
                open(os.path.join(ques_adj_matrices_dir, f'{graph_idx}.pkl'), 'rb'))
            question_edge_indices_all = []  # one graph per round, repeated for all 100 options
            question_edge_attributes_all = []
            for q_graph_round in question_graphs:
                question_edge_index = []
                question_edge_attribute = []
                for edge_index, edge_attr in q_graph_round:
                    question_edge_index.append(edge_index)
                    edge_attr_one_hot = np.zeros((len(self.parse_vocab) + 1,), dtype=np.float32)
                    edge_attr_one_hot[self.parse_vocab.get(edge_attr, len(self.parse_vocab))] = 1.0
                    question_edge_attribute.append(edge_attr_one_hot)
                question_edge_index = np.array(question_edge_index, dtype=np.float64)
                question_edge_attribute = np.stack(question_edge_attribute, axis=0)
                question_edge_indices_all.extend(
                    [torch.from_numpy(question_edge_index).t().long().contiguous() for _ in range(100)])
                question_edge_attributes_all.extend(
                    [torch.from_numpy(question_edge_attribute).contiguous() for _ in range(100)])

            _history_edge_indices_all = pickle.load(
                open(os.path.join(hist_adj_matrices_dir, f'{graph_idx}.pkl'), 'rb'))
            history_edge_indices_all = []
            for hist_edge_indices_rnd in _history_edge_indices_all:
                history_edge_indices_all.extend(
                    [torch.tensor(hist_edge_indices_rnd).t().long().contiguous() for _ in range(100)])

            item = {}
            item['tokens'] = \
                tokens_all
            item['segments'] = segments_all
            item['sep_indices'] = sep_indices_all
            item['mask'] = mask_all
            item['hist_len'] = hist_len_all
            item['input_mask'] = input_mask_all
            item['gt_option_inds'] = torch.LongTensor(gt_option_inds)

            # return dense annotation data as well
            if self.visdial_data_val_dense:
                item['round_id'] = torch.LongTensor([self.visdial_data_val_dense[index]['round_id']])
                item['gt_relevance'] = gt_relevance

            item['question_limits'] = question_limits_all
            item['question_edge_indices'] = question_edge_indices_all
            item['question_edge_attributes'] = question_edge_attributes_all
            item['history_edge_indices'] = history_edge_indices_all
            item['history_sep_indices'] = history_sep_indices_all

            # get image features
            if not self.config['dataloader_text_only']:
                features, num_boxes, boxes, _, image_target, image_edge_indexes, image_edge_attributes = \
                    self._image_features_reader[img_id]
                features, spatials, image_mask, image_target, image_label = encode_image_input(
                    features, num_boxes, boxes, image_target, max_regions=self._max_region_num, mask_prob=0)
            else:
                features = spatials = image_mask = image_target = image_label = torch.tensor([0])

        elif self._split == 'test':
            assert num_options == 100
            cur_rnd_utterance = [self.tokenizer.convert_tokens_to_ids(dialog['caption'].split(' '))]
            options_all = []
            for rnd, utterance in enumerate(dialog['dialog']):
                cur_rnd_utterance.append(self.tokenizer.convert_tokens_to_ids(cur_questions[utterance['question']].split(' ')))
                if rnd != len(dialog['dialog']) - 1:
                    cur_rnd_utterance.append(self.tokenizer.convert_tokens_to_ids(cur_answers[utterance['answer']].split(' ')))
            for answer_option in dialog['dialog'][-1]['answer_options']:
                cur_option = cur_rnd_utterance.copy()
                cur_option.append(self.tokenizer.convert_tokens_to_ids(cur_answers[answer_option].split(' ')))
                options_all.append(cur_option)

            tokens_all = []
            mask_all = []
            segments_all = []
            sep_indices_all = []
            hist_len_all = []
            question_limits_all = []

            for j, option in enumerate(options_all):
                option, start_segment = self.pruneRounds(option, self.config['visdial_tot_rounds'])
                # encode_input is assumed to also return the question span here, as in
                # the train/val branches; the original code unpacked only four values
                # yet later read a question_limits_all that was never defined.
                tokens, segments, sep_indices, mask, start_question, end_question = encode_input(
                    option, start_segment, self.CLS, self.SEP, self.MASK,
                    max_seq_len=MAX_SEQ_LEN, mask_prob=0)
                tokens_all.append(tokens)
                mask_all.append(mask)
                segments_all.append(segments)
                sep_indices_all.append(sep_indices)
                question_limits_all.append(torch.tensor([start_question, end_question]))
                hist_len_all.append(torch.LongTensor([len(option) - 1]))

            tokens_all = torch.cat(tokens_all, 0)
            mask_all = torch.cat(mask_all, 0)
            segments_all = torch.cat(segments_all, 0)
            sep_indices_all = torch.cat(sep_indices_all, 0)
            hist_len_all = torch.cat(hist_len_all, 0)
            question_limits_all = torch.stack(question_limits_all, 0)  # [num_options, 2]

            hist_idx = [i * 2 for i in range(len(dialog['dialog']))]
            history_sep_indices_all = [sep_indices.squeeze(0)[hist_idx].contiguous() for _ in range(num_options)]

            with open(os.path.join(ques_adj_matrices_dir, f'{graph_idx}.pkl'), 'rb') as f:
                question_graphs = pickle.load(f)
            q_graph_last = question_graphs[-1]
            question_edge_index = []
            question_edge_attribute = []
            for edge_index, edge_attr in q_graph_last:
                question_edge_index.append(edge_index)
                edge_attr_one_hot = np.zeros((len(self.parse_vocab) + 1,), dtype=np.float32)
                edge_attr_one_hot[self.parse_vocab.get(edge_attr, len(self.parse_vocab))] = 1.0
                question_edge_attribute.append(edge_attr_one_hot)
            question_edge_index = np.array(question_edge_index, dtype=np.float64)
            question_edge_attribute = np.stack(question_edge_attribute, axis=0)

            question_edge_indices_all = [torch.from_numpy(question_edge_index).t().long().contiguous()
                                         for _ in range(num_options)]
            question_edge_attributes_all = \
                [torch.from_numpy(question_edge_attribute).contiguous() for _ in range(num_options)]

            with open(os.path.join(hist_adj_matrices_dir, f'{graph_idx}.pkl'), 'rb') as f:
                _history_edge_indices_all = pickle.load(f)
            _history_edge_indices_last = _history_edge_indices_all[-1]
            history_edge_indices_all = [torch.tensor(_history_edge_indices_last).t().long().contiguous()
                                        for _ in range(num_options)]

            if self.config['stack_gr_data']:
                question_edge_indices_all = torch.stack(question_edge_indices_all, dim=0)
                question_edge_attributes_all = torch.stack(question_edge_attributes_all, dim=0)
                history_edge_indices_all = torch.stack(history_edge_indices_all, dim=0)
                history_sep_indices_all = torch.stack(history_sep_indices_all, dim=0)
                len_question_gr = torch.tensor(question_edge_indices_all.size(-1)).unsqueeze(0).repeat(num_options, 1)
                len_history_gr = torch.tensor(history_edge_indices_all.size(-1)).repeat(num_options, 1)
                len_history_sep = torch.tensor(history_sep_indices_all.size(-1)).repeat(num_options, 1)

            item = {}
            item['tokens'] = tokens_all.unsqueeze(0)
            item['segments'] = segments_all.unsqueeze(0)
            item['sep_indices'] = sep_indices_all.unsqueeze(0)
            item['mask'] = mask_all.unsqueeze(0)
            item['hist_len'] = hist_len_all.unsqueeze(0)
            item['question_limits'] = question_limits_all
            item['question_edge_indices'] = question_edge_indices_all
            item['question_edge_attributes'] = question_edge_attributes_all
            item['history_edge_indices'] = history_edge_indices_all
            item['history_sep_indices'] = history_sep_indices_all
            if self.config['stack_gr_data']:
                item['len_question_gr'] = len_question_gr
                item['len_history_gr'] = len_history_gr
                item['len_history_sep'] = len_history_sep
            item['round_id'] = torch.LongTensor([dialog['round_id']])

            # get image features
            if not self.config['dataloader_text_only']:
                features, num_boxes, boxes, _, image_target, image_edge_indexes, image_edge_attributes = \
                    self._image_features_reader[img_id]
                features, spatials, image_mask, image_target, image_label = encode_image_input(
                    features, num_boxes, boxes, image_target, max_regions=self._max_region_num, mask_prob=0)
            else:
                features = spatials = image_mask = image_target = image_label = torch.tensor([0])

        item['image_feat'] = features
        item['image_loc'] = spatials
        item['image_mask'] = image_mask
        item['image_target'] = image_target
        item['image_label'] = image_label
        item['image_id'] = torch.LongTensor([img_id])

        if self._split == 'train':
            # cheap hack: duplicate the image graph data for the positive and negative examples
            item['image_edge_indices'] = [torch.from_numpy(image_edge_indexes).long(),
                                          torch.from_numpy(image_edge_indexes).long()]
            item['image_edge_attributes'] = [torch.from_numpy(image_edge_attributes),
                                             torch.from_numpy(image_edge_attributes)]
        elif self._split == 'val':
            # cheap hack: repeat the image graph data for all 10 rounds x 100 answer options
            item['image_edge_indices'] = [torch.from_numpy(image_edge_indexes).contiguous().long() for _ in range(1000)]
            item['image_edge_attributes'] = [torch.from_numpy(image_edge_attributes).contiguous() for _ in range(1000)]
        else:
            # cheap hack: repeat the image graph data for the 100 answer options of the last round
            item['image_edge_indices'] = [torch.from_numpy(image_edge_indexes).contiguous().long() for _ in range(100)]
            item['image_edge_attributes'] = [torch.from_numpy(image_edge_attributes).contiguous() for _ in range(100)]

        if self.config['stack_gr_data']:
            item['image_edge_indices'] = torch.stack(item['image_edge_indices'], dim=0)
            item['image_edge_attributes'] = torch.stack(item['image_edge_attributes'], dim=0)
            len_image_gr = \
                torch.tensor(item['image_edge_indices'].size(-1)).unsqueeze(0).repeat(num_options)
            item['len_image_gr'] = len_image_gr

        return item
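

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the training pipeline. The config
    # keys below are only the ones read directly in this file; DatasetBase also
    # needs the dataset, tokenizer, and feature-reader paths from the project
    # config, so in practice you would load the full project config instead of
    # this hypothetical stub. All values here are illustrative assumptions.
    example_config = {
        'max_seq_len': 256,
        'visdial_question_adj_matrices': 'data/question_adj_matrices',  # assumed path
        'visdial_history_adj_matrices': 'data/history_adj_matrices',    # assumed path
        'visdial_version': 1.0,
        'num_negative_samples': 1,
        'sequences_per_image': 2,
        'visdial_tot_rounds': 11,
        'mask_prob': 0.15,
        'dataloader_text_only': True,  # skip the image-feature reader
        'stack_gr_data': False,
    }

    dataset = VisdialDataset(example_config)
    dataset.split = 'train'  # DatasetBase is assumed to expose a split setter
    item = dataset[0]
    for key, value in item.items():
        # tensors report their shape; graph data is returned as lists of tensors
        size = tuple(value.shape) if torch.is_tensor(value) else len(value)
        print(f'{key}: {size}')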