initial commit

Anna Penzkofer 2024-04-29 17:18:10 +02:00
parent 449dff858d
commit 8e0fd07853
10 changed files with 4550 additions and 1 deletion


@@ -1,3 +1,66 @@
# VSA4VQA
Official code for [VSA4VQA: Scaling a Vector Symbolic Architecture to Visual Question Answering on Natural Images](https://perceptualui.org/publications/penzkofer24_cogsci/) published at CogSci'24
## Installation
```shell
# create environment
conda create -n ssp_env python=3.9 pip
conda activate ssp_env
conda install pytorch torchvision pytorch-cuda=11.8 -c pytorch -c nvidia -y
sudo apt install libmysqlclient-dev
# install requirements
pip install -r requirements.txt
# install CLIP
pip install git+https://github.com/openai/CLIP.git
# setup jupyter notebook kernel
python -m ipykernel install --user --name=ssp_env
```
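To check that the main dependencies are importable, you can run a minimal sanity check such as the following (not part of the repository; the GPU is optional and CLIP downloads its weights on first use):
```python
# quick environment check (illustrative only)
import torch
import clip
import nengo.spa as spa

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
print(f"torch {torch.__version__} on {device}, CLIP input resolution: {model.visual.input_resolution}px")
print(f"Random 512-D semantic pointer, norm = {spa.SemanticPointer(512).length():.2f}")
```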
## Get GQA Programs
Using code from [https://github.com/wenhuchen/Meta-Module-Network](https://github.com/wenhuchen/Meta-Module-Network)<br>
- Download the Meta-Module-Network (MMN) GitHub repo
- Add a `gqa-questions` folder containing the GQA question JSON files
- Run the preprocessing:
`python preprocess.py create_balanced_programs`
- Save the generated programs to the data folder (they are loaded as shown in the sketch below):
```
testdev_balanced_inputs.json
trainval_balanced_inputs.json
testdev_balanced_programs.json
trainval_balanced_programs.json
```
> GQA dictionaries: `gqa_all_attributes.json` and `gqa_all_vocab_classes.json` are also adapted from [https://github.com/wenhuchen/Meta-Module-Network](https://github.com/wenhuchen/Meta-Module-Network)
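The pipeline reads these program files as plain JSON; a minimal sketch of how they are loaded in `run_programs.py` (the path is an example, relative to your data folder):
```python
# minimal sketch: load the generated GQA programs (path is an example)
import json
import pandas as pd

with open('programs/trainval_balanced_programs.json') as f:
    programs = json.load(f)

# column layout used in run_programs.py
columns = ['imageID', 'question', 'program', 'questionID', 'answer']
programs = pd.DataFrame(programs, columns=columns)
# each program is a list of steps of the form '<idx>=<function>(<args>)'
print(programs.iloc[0].program)
```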
## Generate Query Masks
- Generates `full_relations_df.pkl` if it is not already present
- Generates query masks for all relations with more than 1,000 samples
```shell
python generate_query_masks.py
```
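Each query mask is stored as a 500x500 NumPy array in the `relations/` folder; a short sketch for inspecting one (the relation name is just an example):
```python
# minimal sketch: load and display one generated query mask
import numpy as np
import matplotlib.pyplot as plt

mask = np.load('relations/to_the_left_of.npy')  # written by generate_query_masks.py
print(mask.shape)  # (500, 500)

plt.imshow(mask, cmap='gray')
plt.title('to the left of')
plt.axis('off')
plt.show()
```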
## Pipeline
Run the pipeline for all samples in GQA: train_balanced (with `TEST=False`) or validation_balanced (with `TEST=True`).
```shell
python run_programs.py
```
For visualizing samples see [code/GQA_PIPELINE.ipynb](code/GQA_PIPELINE.ipynb) <br>
For generating figures see [code/GQA_EVAL.ipynb](code/GQA_EVAL.ipynb) <br>
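`run_programs.py` writes per-question results to a pandas pickle in the data folder; a minimal sketch for evaluating a finished run (the file name follows the pattern used in the script, adjust `DIM`, split, and seed):
```python
# minimal sketch: compute accuracy from a results pickle written by run_programs.py
import pandas as pd

results = pd.read_pickle('results-DIM2048-VAL17.pkl')  # example: DIM=2048, validation split, seed 17
print(f"Accuracy: {results.correct.mean():.2%}")
print(results[['question', 'true_answer', 'pred_answer']].head())
```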
## Citation
Please consider citing this paper if you use VSA4VQA or parts of this publication in your research:
```
@inproceedings{penzkofer24_cogsci,
author = {Penzkofer, Anna and Shi, Lei and Bulling, Andreas},
title = {VSA4VQA: Scaling A Vector Symbolic Architecture To Visual Question Answering on Natural Images},
booktitle = {Proc. 46th Annual Meeting of the Cognitive Science Society (CogSci)},
year = {2024},
pages = {}
}
```

VSA4VQA_examples.ipynb Normal file (517 additions)

File diff suppressed because one or more lines are too long

dataset.py Normal file (287 additions)

@@ -0,0 +1,287 @@
import os
import cv2
import torch
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as patches
import nengo.spa as spa
from utils import encode_point_multidim, ssp_to_loc_multidim, bb_intersection_over_union
RGB_COLORS = []
for name, hex in mcolors.cnames.items():
RGB_COLORS.append(mcolors.to_rgb(hex))
class MNISTQueryDataset(Dataset):
"""MNIST spatial query dataset."""
def __init__(self, mnist_data, num_imgs, img_size=120, visualize=False, transform=None, seed=42):
# Set random seed for location and mnist image selection
np.random.seed(seed)
torch.manual_seed(seed)
self.mnist_data = mnist_data
self.mnist_size = mnist_data[0][0].squeeze().numpy().shape[0]
# Shuffle MNIST data set according to random seed
self.mnist_indices = torch.randperm(len(self.mnist_data))
self.num_imgs = num_imgs
self.img_size = img_size
self.border = self.img_size - self.mnist_size
self.visualize = visualize
self.transform = transform
def __len__(self):
return len(self.mnist_data) // self.num_imgs
def __getitem__(self, idx):
current_indices = self.mnist_indices[idx: idx + self.num_imgs]
image = np.zeros((self.img_size, self.img_size))
mask = np.ones((self.border, self.border))
labels = []
mnist_imgs = []
for i in current_indices:
mnist, label = self.mnist_data[i]
mnist_imgs.append(mnist.squeeze().numpy())
# find available space
indices = np.where(mask == 1)[:2]
coords = np.transpose(indices)
# pick a random free pixel as top-left corner (x0, y0)
pos_idx = np.random.randint(len(indices[0]))
y_pos, x_pos = coords[pos_idx]
# add mnist to image
image[y_pos: y_pos+self.mnist_size, x_pos: x_pos+self.mnist_size] = mnist.squeeze().numpy()
# position label = center of mnist image
labels.append(dict({label: (x_pos + self.mnist_size // 2, y_pos + self.mnist_size // 2)}))
# remove available space
for x in np.arange(max(0, x_pos-self.mnist_size), min(x_pos+self.mnist_size+1, self.border)):
for y in np.arange(max(0, y_pos-self.mnist_size), min(y_pos+self.mnist_size+1, self.border)):
mask[y, x] = 0
# visualize image state and current mask
if self.visualize:
f, (ax1, ax2) = plt.subplots(1, 2, sharey=False)
ax1.imshow(image, cmap='gray')
ax2.imshow(mask, cmap='gray')
plt.show()
sample = {'image': image, 'labels': labels, 'mnist_images': mnist_imgs}
#if self.transform:
# sample = self.transform(sample)
return sample
class GQADataset():
def __init__(self, questions, programs, scenegraphs, vectors, axes, linspace,
path='GQA/images/images/', seed=17, verbose=0, visualize=False):
np.random.seed(seed)
self.questions = questions
self.programs = programs
self.scenegraphs = scenegraphs
self.vis = visualize
self.verbose = verbose
self.seed = seed
self.path = path
# vector space
self.ssp_vectors = vectors
self.ssp_axes = axes #[x_axis, y_axis, w_axis, h_axis]
self.linspace = linspace # [xs, ys, ws, hs]
def __len__(self):
return len(self.questions)
def __get_item__(self, idx):
q_id = self.questions.iloc[idx].questionID
temp_df = self.questions.loc[self.questions.questionID == q_id]
# get image
img_id = temp_df.imageId.values[0]
img_path = os.path.join(self.path, f'{img_id}.jpg')
assert os.path.exists(img_path), f'Image path {img_path} does not exist!'
img = cv2.imread(img_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# get question and answer
question = temp_df.question.values[0]
answer = temp_df.answer.values[0]
full_answer = temp_df.fullAnswer.values[0]
# get program
idx = self.programs.loc[self.programs.questionID == q_id].index[0]
program = self.programs.iloc[idx].program
info = {'q_id': q_id, 'img_id': img_id, 'question': question, 'answer': answer,
'full_answer': full_answer, 'program': program}
return img, info
def encode_item(self, idx, new_size=(25, 25), dim=1024):
""" Encode all objects in image into SSP memory = vector space.
ensure x- and y-axis of SSP memory have same resolution
and fixed width & height axes (10,10), no zero values for width & height
and int values instead of decimals, otherwise decoding accuracy degrades
"""
img, info = self.__get_item__(idx)
sg_data = self.scenegraphs.get(str(info['img_id'])).get('objects')
img_size = img.shape[:2]
# find orientation and select scale to fit into quadratic vector space
if img_size[1] / 2 < img_size[0]:
scale = img_size[0] / new_size[0]
else:
scale = img_size[1] / new_size[1]
# scale width and height to fixed size of 10
w_scale, h_scale = img_size[1] / 10, img_size[0] / 10
encoded_items = {}
encoded_ssps = {}
rng = np.random.RandomState(seed=self.seed)
memory = spa.SemanticPointer(data=np.zeros(dim), rng=rng)
name_lst = []
if self.vis:
print(f'Original image {img_size[0]}x{img_size[1]} --> {int(img_size[0] / scale)}x{int(img_size[1] / scale)}')
fig, ax = plt.subplots(1,1)
ax.imshow(img, interpolation='none', origin='upper', extent=[0, img_size[1] / scale, img_size[0] / scale, 0])
plt.axis('off')
for i, obj in enumerate(sg_data.items()):
id_num, obj_dict = obj
name = obj_dict.get('name')
#name = singularize(name)
name_lst.append(name)
name += '_' + str(name_lst.count(name))
# extract ground truth data and scale to fit to SSPs
x, y, width, height = obj_dict.get('x'), obj_dict.get('y'), obj_dict.get('w'), obj_dict.get('h')
x, y, width, height = x / scale, y / scale, width / w_scale, height / h_scale
width = width if width >= 1 else 1
height = height if height >= 1 else 1
# Round values to next int (otherwise decoding gets buggy)
item = np.round([x, y, width, height], decimals=0).astype(int)
encoded_items[name] = item
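# bind a random SSP identity for this object to its encoded (x, y, w, h) pose and superimpose it into the memory vector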
pos = encode_point_multidim(list(item), self.ssp_axes)
ssp = spa.SemanticPointer(dim)
encoded_ssps[name] = ssp
memory += ssp * pos
if self.vis:
x, y, width, height = item
width, height = (width * w_scale) / scale, (height * h_scale) / scale
rect = patches.Rectangle((x, y),
width, height,
linewidth = 2,
label = name,
edgecolor = RGB_COLORS[i],
facecolor = 'none')
ax.add_patch(rect)
if self.vis:
plt.show()
info['encoded_items'] = encoded_items
info['encoded_ssps'] = encoded_ssps
info['scales'] = [scale, w_scale, h_scale]
return img, info, memory
def decode_item(self, img, info, memory):
img_size = img.shape[:2]
scale, w_scale, h_scale = info['scales']
if self.vis:
fig, ax = plt.subplots(1,1)
ax.imshow(img, interpolation='none', origin='upper', extent=[0, img_size[1] / scale, img_size[0] / scale, 0])
plt.axis('off')
errors = []
iou_lst = []
iou_binary_lst = []
for i, (name, data) in enumerate(info['encoded_items'].items()):
ssp_item = info['encoded_ssps'][name]
item_decoded = memory *~ ssp_item
clean_loc = ssp_to_loc_multidim(item_decoded, self.ssp_vectors, self.linspace)
x, y, width, height = clean_loc
mse = np.square(np.subtract(data[:2], clean_loc[:2])).mean()
errors.append(mse)
width, height = width * w_scale / scale, height * h_scale / scale
bb_gt = np.array([data[0], data[1], data[0]+(data[2] * w_scale / scale), data[1]+(data[3] * h_scale / scale)])
iou = bb_intersection_over_union(bb_gt, [x, y, x+width, y+height])
iou_lst.append(iou)
if iou > 0.5:
iou_binary_lst.append(1)
else:
iou_binary_lst.append(0)
if self.vis:
rect = patches.Rectangle((x, y),
width, height,
linewidth = 2,
label = name,
edgecolor = RGB_COLORS[i],
facecolor = 'none')
ax.add_patch(rect)
if self.vis:
plt.legend(loc='upper left', bbox_to_anchor=(1., 1.02))
plt.show()
avg_mse = np.mean(errors)
avg_iou = np.mean(iou_lst)
if self.verbose > 0:
print(f'Average mean-squared error of 2D locations: {avg_mse:.4f}')
print(f'Average IoU of 4D bounding boxes: {avg_iou:.2f}')
print(f'Correct items: {np.sum(iou_binary_lst)} / {len(info["encoded_items"])}')
return avg_mse, avg_iou, np.sum(iou_binary_lst)
def print_item(self, idx):
_, info = self.__get_item__(idx)
print(f"Question #{info['q_id']}: \n{info['question']}")
print(f"[{info['answer']}] {info['full_answer']}\n")
print('Program:')
for i, step in enumerate(info['program']):
num, func = step.split('=')
print(f'{i}. {func}')
print()
def set_visualize(self, visualize):
self.vis = visualize
def set_verbose(self, verbose):
self.verbose = verbose

generate_query_masks.py Normal file (300 additions)

@@ -0,0 +1,300 @@
import os
import time
import json
import queue
from multiprocessing import Process, Queue
from multiprocessing.pool import Pool
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
DATA_PATH = 'GQA/'
REL_PATH = 'full_relations_df.pkl'
IMG_SIZE = (500, 500)
NUM_PROCESSES = 20
NUM_SAMPLES = 100
def bbox_to_mask(x, y, w, h, img_size=IMG_SIZE, name=None, visualize=False):
img = np.zeros(img_size)
mask_w = np.ones(np.clip(w, 0, img_size[1]-x))
for j in range(y, np.clip(y+h, 0, img_size[0])):
img[j][x:x+w] = mask_w
if visualize:
fig = plt.figure(figsize=(img_size[0] // 80, img_size[1] // 80))
plt.imshow(img, cmap='gray')
if name:
plt.title(name)
plt.axis('off')
plt.show()
return img
def get_all_relations_df(data):
print(f'Length of scenegraph data set: {len(data)}')
start = time.time()
df = pd.DataFrame(columns=['image_id', 'relation', 'from', 'to', 'obj_loc', 'obj_w', 'obj_h', 'obj_center',
'rel_obj_loc', 'rel_obj_w', 'rel_obj_h'])
for img_id in data.keys():
all_objects = data.get(str(img_id)).get('objects').items()
# get all object names
all_objects_dict = {id_num: (obj_dict.get('name'), obj_dict.get('x'), obj_dict.get('y'), obj_dict.get('w'), obj_dict.get('h'))
for (id_num, obj_dict) in all_objects}
# get all relations
for obj in all_objects:
id_num, obj_dict = obj
name = obj_dict.get('name')
x, y, width, height = obj_dict.get('x'), obj_dict.get('y'), obj_dict.get('w'), obj_dict.get('h')
center = [x + width / 2, y + height / 2]
for relation in obj_dict.get('relations'):
rel = relation.get('name')
rel_obj, rel_x, rel_y, rel_w, rel_h = all_objects_dict.get(relation.get('object'))
temp = pd.DataFrame.from_dict([{'image_id': img_id, 'relation': rel, 'from': name, 'to': rel_obj,
'obj_loc': [x, y], 'obj_w': width, 'obj_h': height, 'center': center,
'rel_obj_loc': [rel_x, rel_y], 'rel_obj_w': rel_w, 'rel_obj_h': rel_h}])
df = pd.concat([df, temp], ignore_index=True)
#print(f'{df.iloc[-1]["from"]} {df.iloc[-1].relation} {df.iloc[-1].to}')
out_path = 'all_relations.pkl'
df.to_pickle(out_path)
print(f'Saved df to {out_path}')
end = time.time()
elapsed = end - start
print(f'Took {int(elapsed // 60)}:{int(elapsed % 60)} min:s for all {len(df)} relations --> {elapsed / len(df):.2f}s / relation')
def generate_query_mask(df, rel, i, img_center=np.array([250, 250]), uni_size=np.array([50, 50])):
# uni_obj only needed for visualization in the end
uni_obj = bbox_to_mask(img_center[0] - (uni_size[0] // 2), img_center[1] - (uni_size[1] // 2),
50, 50, img_size=(500, 500))
temp_df = df.loc[df.relation == rel]
print(f'[{i}] Number of "{rel}" samples: {len(temp_df)}')
query_mask = np.zeros((500, 500), dtype=np.uint8)
counter = 0
num_discard = 0
for idx in range(len(temp_df)):
if counter >= NUM_SAMPLES:
print(f'[{i}] Reached {counter} samples for relation "{rel}":')
break
img_id = temp_df.iloc[idx].image_id
img_size = (data.get(img_id)['height'], data.get(img_id)['width'])
# get relative object info and generate binary mask
obj_loc = temp_df.iloc[idx].rel_obj_loc
width = temp_df.iloc[idx].rel_obj_w
height = temp_df.iloc[idx].rel_obj_h
# get mask info and generate binary mask
mask_loc = temp_df.iloc[idx].obj_loc
mask_w = temp_df.iloc[idx].obj_w
mask_h = temp_df.iloc[idx].obj_h
if obj_loc[0] > img_size[1] or obj_loc[1] > img_size[0] or mask_loc[0] > img_size[1] or mask_loc[1] > img_size[0]:
#print('error in bounding box -- discard sample')
continue
obj = bbox_to_mask(obj_loc[0], obj_loc[1], width, height, img_size=img_size)
mask = bbox_to_mask(mask_loc[0], mask_loc[1], mask_w, mask_h, img_size=img_size)
img = obj*2 + mask
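# combined image: mask-only pixels = 1, relative-object pixels = 2, overlap = 3 (separated again after warping)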
img_transformed = np.zeros((1000, 1000), dtype=np.uint8)
# scale image first
scale_x, scale_y = uni_size[0] / width, uni_size[1] / height
scale_mat = np.array([[scale_y, 0, 0], [0, scale_x, 0], [0, 0, 1]])
if scale_x > 5 or scale_y > 5:
num_discard += 1
#print(f'Scale is too high! x: {scale_x}, y: {scale_y} -- discard sample')
continue
# warp every non-zero pixel with the scale matrix (r/c indices, so the relation index i is kept intact)
for r, row in enumerate(img):
for c, _ in enumerate(row):
pixel_data = img[r, c]
input_coords = np.array([r, c, 1])
r_out, c_out, _ = scale_mat @ input_coords
if r_out > 0 and r_out < 1000 and c_out > 0 and c_out < 1000 and pixel_data > 0:
# new indices must be within new image -- discard others
img_transformed[int(r_out), int(c_out)] = pixel_data
if not len(np.where(img_transformed >= 2)[0]) > 0:
# no data in transformed image -- discard sample
continue
# find new (x, y) location of object
new_loc = sorted([[y, x] for (y, x) in zip(*np.where(img_transformed >= 2))])[0]
new_center = [new_loc[0] + uni_size[0] // 2, new_loc[1] + uni_size[1] // 2]
# move object to center
move_x, move_y = img_center - new_center
move_mat = np.array([[1, 0, move_x], [0, 1, move_y], [0, 0, 1]])
img_moved = np.zeros((500, 500), dtype=np.uint8)
for r, row in enumerate(img_transformed):
for c, _ in enumerate(row):
pixel_data = img_transformed[r, c]
input_coords = np.array([r, c, 1])
r_out, c_out, _ = move_mat @ input_coords
if r_out > 0 and r_out < 500 and c_out > 0 and c_out < 500 and pixel_data > 0:
# new indices must be within new image -- discard others
img_moved[int(r_out), int(c_out)] = pixel_data
# extract relative object mask and add to query mask
mask_transformed = np.where(img_moved==1, img_moved, 0) + np.where(img_moved==3, img_moved, 0)
query_mask += mask_transformed
counter += 1
if counter > 0:
query_mask = query_mask / counter
rel_name = '_'.join(rel.split(' '))
np.save(f'relations/{rel_name}.npy', query_mask)
print(f'[{i}] Saved query mask to: relations/{rel_name}.npy')
if num_discard > 0:
print(f'[{i}] Discarded {num_discard} samples, because scaling was too high.')
plt.figure(figsize=(3,3))
plt.imshow(uni_obj*0.1+ query_mask, cmap='gray')
plt.title(rel)
plt.axis('off')
plt.savefig(f'relations/{rel_name}.png', bbox_inches='tight', dpi=300)
plt.clf()
else:
print(f'[{i}] Could not generate query mask for "{rel}"')
def run_process(tasks, df):
while True:
try:
'''
Try to get a task from the queue. get_nowait() raises a
queue.Empty exception if the queue is empty
(get(False) behaves the same way).
'''
task = tasks.get_nowait()
i = list(df.relation.unique()).index(task)
except queue.Empty:
break
else:
''' no exception has been raised '''
print(f'[{i}] Starting relation #{i}: {task}')
print()
generate_query_mask(df, task, i)
time.sleep(.5)
return True
# task executed in a worker process
def get_relations_task(img_id):
width, height = data.get(str(img_id))['width'], data.get(str(img_id))['height']
all_objects = data.get(str(img_id)).get('objects').items()
# get all object names
all_objects_dict = {id_num: (obj_dict.get('name'), obj_dict.get('x'), obj_dict.get('y'), obj_dict.get('w'), obj_dict.get('h'))
for (id_num, obj_dict) in all_objects}
all_relations = []
# get all relations
for obj in all_objects:
id_num, obj_dict = obj
name = obj_dict.get('name')
x, y, obj_w, obj_h = obj_dict.get('x'), obj_dict.get('y'), obj_dict.get('w'), obj_dict.get('h')
center = [x + width / 2, y + height / 2]
for relation in obj_dict.get('relations'):
rel = relation.get('name')
rel_obj, rel_x, rel_y, rel_w, rel_h = all_objects_dict.get(relation.get('object'))
all_relations.append({'image_id': img_id, 'width': width, 'height': height, 'relation': rel,
'from': name, 'to': rel_obj, 'obj_loc': [x, y], 'obj_w': obj_w, 'obj_h': obj_h,
'obj_center': center,'rel_obj_loc': [rel_x, rel_y], 'rel_obj_w': rel_w, 'rel_obj_h': rel_h})
return all_relations
if __name__ == '__main__':
path = os.path.join(DATA_PATH, 'train_sceneGraphs.json')
assert os.path.exists(path), f'{path} does not exist!'
with open(os.path.join(DATA_PATH, 'train_sceneGraphs.json'), 'r') as f:
data = json.load(f)
print(f'Length of scenegraph data set: {len(data)}')
if not os.path.exists(REL_PATH):
print('Generating dataframe of all relations...')
# generate list of relations pkl -- use multiprocessing!
# create and configure the process pool
with Pool(processes=NUM_PROCESSES) as pool:
df = pd.DataFrame(columns=['image_id', 'width', 'height', 'relation', 'from', 'to', 'obj_loc', 'obj_w',
'obj_h', 'obj_center', 'rel_obj_loc', 'rel_obj_w', 'rel_obj_h'])
# execute tasks in order
for i, result in enumerate(tqdm(pool.map(get_relations_task, list(data.keys()), chunksize=100))):
temp = pd.DataFrame.from_dict(result)
df = pd.concat([df, temp], ignore_index=True)
if i % 10000 == 0:
df.to_pickle('temp_' + REL_PATH)
print(f'Saved df to {"temp_" + REL_PATH}')
df.to_pickle(REL_PATH)
print(f'Saved df to {REL_PATH}')
else:
df = pd.read_pickle(REL_PATH)
print(f'Number of relations: {len(df.relation.unique())}')
print(df.relation.unique())
# generate query mask for each relation
#for i, rel in enumerate(df.relation.unique()):
# generate_query_mask(df, rel, i)
print('Generating a query mask for each relation...')
# generate query mask for each relation -- use multiprocessing
tasks = Queue()
procs = []
# only use relations with at least 1000 samples
rel_lst = df.relation.value_counts()[df.relation.value_counts() > 1000].index.to_list()
for rel in rel_lst:
tasks.put(rel)
# creating processes -- run only NUM_PROCESSES processes at the same time
for _ in range(NUM_PROCESSES):
p = Process(target=run_process, args=(tasks, df,))
procs.append(p)
p.start()
# completing all processes
for p in procs:
p.join()

gqa_all_attributes.json Normal file (222 additions)

@@ -0,0 +1,222 @@
{
"color": [
"beige",
"black",
"blond",
"blue",
"brown",
"brunette",
"cream colored",
"dark",
"dark blue",
"dark brown",
"gold",
"gray",
"green",
"khaki",
"light blue",
"light brown",
"maroon",
"orange",
"pink",
"purple",
"red",
"silver",
"tan",
"teal",
"white",
"yellow"
],
"pose": [
"bending",
"brushing tooth",
"crouching",
"jumping",
"lying",
"making a face",
"pointing",
"running",
"shaking hand",
"sitting",
"standing",
"taking a photo",
"taking a picture",
"taking picture",
"walking"
],
"material": [
"brick",
"concrete",
"glass",
"leather",
"metal",
"plastic",
"porcelain",
"wood"
],
"activity": [
"brushing tooth",
"cooking",
"drinking",
"driving",
"eating",
"looking down",
"looking up",
"playing",
"posing",
"reading",
"resting",
"sleeping",
"staring",
"talking",
"waiting"
],
"weather": [
"clear",
"cloudless",
"cloudy",
"foggy",
"overcast",
"partly cloudy",
"rainy",
"stormy",
"sunny"
],
"size": [
"giant",
"huge",
"large",
"little",
"small",
"tiny"
],
"fatness": [
"fat",
"skinny",
"thin"
],
"gender": [
"female",
"male"
],
"height": [
"short",
"tall"
],
"state": [
"calm",
"choppy",
"rough",
"smooth",
"still",
"wavy"
],
"hposition": [
"left",
"right"
],
"length": [
"long",
"short"
],
"shape": [
"octagonal",
"rectangular",
"round",
"square",
"triangular"
],
"pattern": [
"checkered",
"dotted",
"striped"
],
"thickness": [
"thick",
"thin"
],
"age": [
"little",
"old",
"young"
],
"tone": [
"light",
"dark"
],
"room": [
"attic",
"bathroom",
"bedroom",
"dining room",
"kitchen",
"living room",
"office"
],
"width": [
"narrow",
"wide"
],
"depth": [
"deep",
"shallow"
],
"cleanliness": [
"clean",
"dirty",
"stained",
"tinted"
],
"hardness": [
"hard",
"soft"
],
"race": [
"asian",
"caucasian"
],
"company": [
"adida",
"nike"
],
"sportActivity": [
"performing trick",
"riding",
"skateboarding",
"skating",
"skiing",
"snowboarding",
"surfing",
"swimming"
],
"sportactivity": [
"performing trick",
"riding",
"skateboarding",
"skating",
"skiing",
"snowboarding",
"surfing",
"swimming"
],
"weight": [
"heavy",
"light"
],
"texture": [
"coarse",
"fine"
],
"flavor": [
"chocolate",
"strawberry",
"vanilla"
],
"realism": [
"fake",
"real"
],
"face expression": [
"making a face"
]
}

gqa_all_relations_map.json Normal file (314 additions)

@@ -0,0 +1,314 @@
{
"to the left of": "to the left of",
"to the right of": "to the right of",
"on": "on",
"wearing": "wearing",
"of": "of",
"near": "near",
"in": "in",
"behind": "behind",
"in front of": "in front of",
"holding": "holding",
"on top of": "on top of",
"next to": "next to",
"above": "above",
"with": "with",
"below": "below",
"by": "by",
"sitting on": "sitting on",
"under": "under",
"on the side of": "on the side of",
"beside": "beside",
"standing on": "standing on",
"inside": "inside",
"carrying": "carrying",
"at": "at",
"walking on": "walking on",
"riding": "riding",
"standing in": "standing in",
"covered by": "covered by",
"around": "around",
"lying on": "lying on",
"hanging on": "hanging on",
"eating": "eating",
"watching": "watching",
"looking at": "looking at",
"covering": "covering",
"sitting in": "sitting in",
"on the front of": "on the front of",
"hanging from": "hanging on",
"parked on": "on",
"riding on": "on",
"using": "holding",
"covered in": "covered by",
"flying in": "sitting in",
"sitting at": "sitting in",
"playing with": "holding",
"full of": "carrying",
"filled with": "carrying",
"walking in": "walking on",
"crossing": "walking on",
"on the back of": "behind",
"surrounded by": "inside",
"swinging": "sitting in",
"standing next to": "next to",
"reflected in": "near",
"covered with": "covered by",
"touching": "holding",
"flying": "near",
"pulling": "holding",
"pulled by": "next to",
"contain": "carrying",
"hitting": "holding",
"leaning on": "next to",
"lying in": "lying on",
"standing by": "next to",
"driving on": "walking on",
"throwing": "near",
"sitting on top of": "on top of",
"surrounding": "around",
"underneath": "below",
"walking down": "walking on",
"parked in": "standing in",
"growing in": "sitting in",
"standing near": "near",
"growing on": "sitting on",
"standing behind": "behind",
"playing": "holding",
"printed on": "on",
"mounted on": "on",
"beneath": "below",
"attached to": "next to",
"talking on": "on",
"facing": "looking at",
"leaning against": "next to",
"cutting": "holding",
"driving": "sitting in",
"worn on": "on",
"resting on": "on",
"floating in": "in",
"lying on top of": "on top of",
"catching": "holding",
"grazing on": "standing on",
"on the bottom of": "below",
"drinking": "holding",
"standing in front of": "in front of",
"topped with": "on top of",
"playing in": "inside",
"walking with": "with",
"swimming in": "inside",
"driving down": "walking on",
"hanging over": "above",
"pushed by": "next to",
"pushing": "holding",
"playing on": "on",
"sitting next to": "next to",
"close to": "near",
"feeding": "holding",
"waiting for": "near",
"between": "next to",
"running on": "walking on",
"tied to": "next to",
"on the edge of": "on top of",
"talking to": "next to",
"holding onto": "holding",
"eating from": "holding",
"perched on": "on",
"reading": "holding",
"parked by": "next to",
"painted on": "on",
"reaching for": "holding",
"sleeping on": "lying on",
"connected to": "next to",
"grazing in": "in",
"hanging above": "above",
"floating on": "on",
"wrapped around": "around",
"stacked on": "on",
"skiing on": "walking on",
"parked at": "next to",
"standing at": "next to",
"hanging in": "hanging on",
"parked near": "near",
"walking across": "walking on",
"plugged into": "next to",
"standing beside": "beside",
"parked next to": "next to",
"working on": "on",
"stuck on": "on",
"stuck in": "in",
"drinking from": "holding",
"seen through": "in front of",
"kicking": "near",
"sitting by": "by",
"sitting in front of": "in front of",
"looking out": "behind",
"petting": "holding",
"parked in front of": "in front of",
"wrapped in": "covered by",
"flying over": "above",
"selling": "holding",
"lying inside": "lying on",
"coming from": "near",
"parked along": "standing on",
"serving": "holding",
"sitting inside": "inside",
"sitting with": "with",
"walking by": "by",
"standing under": "below",
"making": "holding",
"walking through": "walking on",
"standing on top of": "on top of",
"hung on": "below",
"walking along": "by",
"walking near": "near",
"going down": "walking on",
"flying through": "near",
"running in": "walking on",
"leaving": "near",
"mounted to": "on top of",
"sitting behind": "behind",
"on the other side of": "on the side of",
"licking": "holding",
"riding in": "riding",
"followed by": "by",
"following": "by",
"sniffing": "looking at",
"biting": "with",
"parked alongside": "by",
"flying above": "above",
"chasing": "near",
"leading": "near",
"boarding": "near",
"hanging off": "below",
"walking behind": "behind",
"parked behind": "behind",
"sitting near": "near",
"helping": "holding",
"parked beside": "beside",
"growing near": "near",
"sitting under": "below",
"coming out of": "in front of",
"sitting beside": "beside",
"hanging out of": "hanging on",
"served on": "on",
"staring at": "looking at",
"walking toward": "near",
"hugging": "carrying",
"skiing in": "in",
"entering": "in front of",
"looking in": "looking at",
"draped over": "covering",
"walking next to": "next to",
"tied around": "covering",
"growing behind": "behind",
"exiting": "in front of",
"balancing on": "on",
"drawn on": "on",
"jumping over": "above",
"looking down at": "below",
"looking into": "looking at",
"reflecting in": "in front of",
"posing with": "with",
"eating at": "at",
"sewn on": "on",
"walking up": "walking on",
"leaning over": "on the side of",
"about to hit": "holding",
"reflected on": "in front of",
"approaching": "near",
"getting on": "on",
"observing": "watching",
"growing next to": "next to",
"traveling on": "on",
"walking towards": "near",
"growing by": "by",
"displayed on": "on",
"wading in": "standing in",
"growing along": "beside",
"mixed with": "covered by",
"grabbing": "holding",
"jumping on": "walking on",
"scattered on": "on",
"opening": "holding",
"climbing": "walking on",
"pointing at": "at",
"preparing": "holding",
"coming down": "above",
"decorated by": "by",
"decorating": "on",
"taller than": "than",
"going into": "standing in",
"growing from": "on",
"tossing": "holding",
"eating in": "in",
"sleeping in": "inside",
"herding": "near",
"chewing": "eating",
"washing": "holding",
"looking through": "looking at",
"picking up": "holding",
"trying to catch": "holding",
"working in": "in",
"slicing": "holding",
"skiing down": "walking on",
"looking over": "looking at",
"standing against": "next to",
"typing on": "on",
"piled on": "on",
"lying next to": "next to",
"tying": "standing on",
"smiling at": "looking at",
"smoking": "holding",
"cleaning": "carrying",
"shining through": "behind",
"guiding": "near",
"walking to": "near",
"chained to": "next to",
"dragging": "carrying",
"cooking": "holding",
"going through": "holding",
"enclosing": "covering",
"smelling": "eating",
"adjusting": "holding",
"photographing": "looking at",
"skating on": "walking on",
"running through": "walking on",
"decorated with": "with",
"kissing": "next to",
"falling off": "below",
"walking into": "in front of",
"blowing out": "eating",
"walking past": "behind",
"towing": "near",
"worn around": "covering",
"jumping off": "on top of",
"sprinkled on": "on top of",
"moving": "carrying",
"running across": "walking on",
"hidden by": "behind",
"traveling down": "walking on",
"looking toward": "looking at",
"splashing": "near",
"hang from": "below",
"kept in": "inside",
"sitting around": "sitting on",
"displayed in": "inside",
"cooked in": "inside",
"sitting atop": "sitting on",
"brushing": "holding",
"in between": "next to",
"buying": "holding",
"standing around": "next to",
"larger than": "than",
"smaller than": "than",
"pouring": "holding",
"playing at": "at",
"longer than": "than",
"higher than": "than",
"jumping in": "in",
"shorter than": "than",
"bigger than": "than"
}

gqa_all_vocab_classes.json Normal file (1541 additions)

File diff suppressed because it is too large

requirements.txt Normal file (39 additions)

@@ -0,0 +1,39 @@
# Requirements for SSP VQA project
# Standard Libraries
opencv-python
ipykernel
ipywidgets
numpy
pandas
tqdm
matplotlib
imageio
moviepy
scikit-learn
wandb
torchinfo
torchmetrics
# Nengo Libraries
nengo
nbconvert>=7
mistune>=2
nengo-spa
# CLIP requirements
ftfy
regex
tqdm
# DFOL-VQA Libraries
h5py
pyyaml
mysqlclient
pattern
# NLP Libraries
stanza
sexpdata
nltk
svgling

run_programs.py Normal file (968 additions)

@@ -0,0 +1,968 @@
import os
import sys
os.environ["OPENBLAS_NUM_THREADS"] = "10"
import json
import cv2
import re
import random
import time
import torch
import clip
import logging
from datetime import datetime
from PIL import Image
from tqdm import tqdm
import pandas as pd
import numpy as np
import nengo.spa as spa
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors
from collections import OrderedDict
from pattern.text.en import singularize, pluralize
from dataset import GQADataset
from utils import *
DATA_PATH = '/scratch/penzkofer/GQA'
RGB_COLORS = []
for name, hex in mcolors.cnames.items():
RGB_COLORS.append(mcolors.to_rgb(hex))
CUDA_DEVICE = 7
torch.cuda.set_device(CUDA_DEVICE)
device = torch.device("cuda:" + str(CUDA_DEVICE))
clip_model, preprocess = clip.load("ViT-B/32", device=device)
with open('gqa_all_relations_map.json') as f:
RELATION_DICT = json.load(f)
with open('gqa_all_vocab_classes.json') as f:
CLASS_DICT = json.load(f)
with open('gqa_all_attributes.json') as f:
ATTRIBUTE_DICT = json.load(f)
SYNONYMS = {'he': ['man', 'boy'], 'she': ['woman', 'girl']}
ANSWER_MAP = {'to the right of': 'right', 'to the left of': 'left'}
VISUALIZE = False
def plot_heatmap_multidim(sp, xs, ys, heatmap_vectors, name='', vmin=-1, vmax=1, cmap='plasma', invert=False):
"""adapted from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master"""
assert sp.__class__.__name__ == 'SemanticPointer', \
f'Queried object needs to be of type SemanticPointer but is {sp.__class__.__name__}'
# axes: a list of axes to be summed over, first sequence applying to first tensor, second to second tensor
vs = np.tensordot(sp.v, heatmap_vectors, axes=([0], [4]))
res = np.unravel_index(np.argmax(vs, axis=None), vs.shape)
plt.imshow(np.transpose(vs[:, :, res[2], res[3]]), origin='upper', interpolation='none', extent=(xs[-1], xs[0], ys[-1], ys[0]), vmin=vmin, vmax=vmax, cmap=cmap)
plt.colorbar()
plt.axis('off')
plt.title(name)
plt.show()
def select_ssp(name, memory, encoded_ssps, vectors, linspace):
"""decode location of object with name from SSP memory"""
ssp_item = encoded_ssps[name]
item_decoded = memory *~ ssp_item
clean_loc = ssp_to_loc_multidim(item_decoded, vectors, linspace)
return item_decoded, clean_loc
def clip_query(bbox, img, obj_name, clip_tokens, visualize=False):
"""Implements CLIP queries for different attributes"""
x, y, w, h = bbox
obj_center = (x + w / 2, y + h / 2)
masked_img = img.copy()
masked_img = cv2.ellipse(masked_img, (int(obj_center[0]), int(obj_center[1])), (int(w*0.7), int(h*0.7)),
0, 0, 360, (255, 0, 0), 4)
if visualize:
plt.imshow(masked_img)
plt.axis('off')
plt.show()
masked_img = Image.fromarray(np.uint8(masked_img))
tokens = clip.tokenize(clip_tokens)
with torch.no_grad():
text_features=clip_model.encode_text(tokens.to(device))
image_features = clip_model.encode_image(preprocess(masked_img).unsqueeze(0).to(device))
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
indices = torch.max(similarity, 1)[1]
similarity = similarity.squeeze()
scores = [s.item() for s in similarity]
pred = clip_tokens[indices.squeeze().item()]
if visualize:
print('CLIP')
print(scores)
print(pred)
return indices.squeeze().item()
def clip_query_scene(img, clip_tokens, verbose=0):
"""Implements CLIP queries for entire scene, i.e. no bounding box selection"""
img = Image.fromarray(np.uint8(img))
tokens = clip.tokenize(clip_tokens)
with torch.no_grad():
text_features=clip_model.encode_text(tokens.to(device))
image_features = clip_model.encode_image(preprocess(img).unsqueeze(0).to(device))
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
indices = torch.max(similarity, 1)[1]
similarity = similarity.squeeze()
scores = [s.item() for s in similarity]
pred = clip_tokens[indices.squeeze().item()]
if verbose > 0:
print('CLIP')
print(scores)
print(pred)
return indices.squeeze().item()
def clip_choose(bbox1, bbox2, img, attribute, visualize=False):
"""Run attribute vs. not attribute check for both subjects
-- select clip prediction with higher confidence"""
x1, y1, w1, h1 = bbox1
obj_center = (x1 + w1 / 2, y1 + h1 / 2)
masked_img1 = img.copy()
masked_img1 = cv2.ellipse(masked_img1, (int(obj_center[0]), int(obj_center[1])), (int(w1*0.7), int(h1*0.7)),
0, 0, 360, (255, 0, 0), 4)
x2, y2, w2, h2 = bbox2
obj_center = (x2 + w2 / 2, y2 + h2 / 2)
masked_img2 = img.copy()
masked_img2 = cv2.ellipse(masked_img2, (int(obj_center[0]), int(obj_center[1])), (int(w2*0.7), int(h2*0.7)),
0, 0, 360, (255, 0, 0), 4)
if visualize:
plt.imshow(masked_img1)
plt.axis('off')
plt.show()
plt.imshow(masked_img2)
plt.axis('off')
plt.show()
masked_img1 = Image.fromarray(np.uint8(masked_img1))
masked_img2 = Image.fromarray(np.uint8(masked_img2))
tokens = clip.tokenize([attribute, f'not {attribute}'])
with torch.no_grad():
text_features=clip_model.encode_text(tokens.to(device))
image_features = clip_model.encode_image(preprocess(masked_img1).unsqueeze(0).to(device))
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity1 = (100.0 * image_features @ text_features.T).softmax(dim=-1)
similarity1 = similarity1.squeeze()[0]
with torch.no_grad():
text_features=clip_model.encode_text(tokens.to(device))
image_features = clip_model.encode_image(preprocess(masked_img2).unsqueeze(0).to(device))
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity2 = (100.0 * image_features @ text_features.T).softmax(dim=-1)
similarity2 = similarity2.squeeze()[0]
if visualize:
logging.info(f'CLIP prediction: {[similarity1, similarity2]}')
return 0 if similarity1 > similarity2 else 1
def get_rel_path(rel, verbose=0):
"""get correct relation path, map rel to one of the 37 existing query masks"""
rel = RELATION_DICT.get(rel.strip()) # get synonym of relation if no mask exists
rel = '_'.join(rel.split(' ')) if ' ' in rel else rel
path = 'relations/' + rel + '.npy'
if verbose > 0:
logging.info(f'Loading {path}')
return path
def use_query_mask(obj_pos, info, rel, linspace, axes, dim, memory, verbose=0, visualize=False):
"""implements query mask usage: load spatial query mask for relation rel,
scale query mask to object, encode mask to SSP region and
move to object position in SSP memory, extract object proposals in region"""
xs, ys, ws, hs = linspace
x_axis, y_axis, w_axis, h_axis = axes
x, y, width, height = obj_pos
# 50 pixels was object size in query mask generation -- use pixel scale values for height and width
iso_scale = np.mean([(width*info['scales'][1]) / 50, (height*info['scales'][2]) / 50])
mask = np.load(get_rel_path(rel, verbose))
mask = cv2.resize(mask, (100, 100), interpolation = cv2.INTER_AREA)
# crop new query mask according to scale
new_area = int(100 / iso_scale)
if verbose > 0:
print(iso_scale, new_area)
resized = mask[max(0, 50-new_area): min(50+new_area, 100), max(0, 50-new_area): min(50+new_area, 100)]
resized = cv2.resize(resized, (100,100), interpolation = cv2.INTER_AREA)
if visualize:
fig, axs = plt.subplots(1,2, sharey=True, layout="constrained", figsize=(6, 3))
fig.set_tight_layout(True)
fig.subplots_adjust(top=1.05)
fig.suptitle('Relation: '+ rel)
plt.setp(plt.gcf().get_axes(), xticks=[], yticks=[])
axs[0].imshow(mask, cmap='gray')
axs[0].title.set_text('original')
axs[1].imshow(resized, cmap='gray')
axs[1].title.set_text('resized')
plt.show()
# encode mask to SSP region
counter = 0
vector = spa.SemanticPointer(data=np.zeros(dim))
for (i, j) in zip(*np.where(resized > 0.05)):
x, y = xs[i], ys[j]
vector += encode_point_multidim([y, x, 1, 1], axes=axes)
counter += 1
vector.normalize()
if verbose > 0:
logging.info(f'Resized mask encoded {counter} points')
if visualize:
plot_heatmap_multidim(vector, xs, ys, VECTORS, vmin=-0.2, vmax=0.2, name=f'Encoded Region')
# get object info and move query mask to position
x, y, width, height = obj_pos
obj_center = (x + width / 2, y + height / 2)
img_center = np.array([xs[50], ys[50]])
shift = -img_center + (obj_center[1], obj_center[0])
encoded_shift = encode_point(shift[1], shift[0], x_axis=x_axis, y_axis=y_axis)
shifted_pos = vector.convolve(encoded_shift)
shifted_pos.normalize()
if visualize:
plot_heatmap_multidim(shifted_pos, xs, ys, VECTORS, vmin=-0.1, vmax=0.1, name=f'Query Region')
# query region and compare output to vocab = all saved SSPs
vocab_vectors = np.zeros((len(info['encoded_ssps']), dim))
color_lst = []
for i, (name, ssp) in enumerate(info['encoded_ssps'].items()):
vocab_vectors[i, :] = ssp.v
color_lst.append(RGB_COLORS[i])
similarity = np.tensordot((memory * ~shifted_pos).v, vocab_vectors, axes=([0], [1]))
d = OrderedDict(zip(list(info['encoded_ssps'].keys()), similarity))
res = list(OrderedDict(sorted(d.items(), key=lambda x: x[1], reverse=True)))
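# keep only vocabulary objects with positive similarity, ordered from most to least similar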
proposals = np.array(res)[np.array(np.array(sorted(similarity, reverse=True)) > 0).astype(bool)]
if verbose > 0:
print('Proposals: ', *proposals)
if visualize:
fig = plt.Figure()
plt.bar(np.arange(len(info['encoded_ssps'])), similarity, color=color_lst, label=info['encoded_items'].keys())
plt.title('Similarity')
plt.legend(loc='upper left', bbox_to_anchor=(1.0, 1.05))
plt.show()
return proposals
def select_func(data, img, obj, info, counter, memory, visualize=False):
"""implements select function: probe SSP memory with given object name"""
result = None
if obj + '_' + str(counter) in info['encoded_ssps'].keys():
obj += '_' + str(counter)
obj_ssp, obj_pos = select_ssp(obj, memory, info['encoded_ssps'], data.ssp_vectors, data.linspace)
result = [obj, obj_ssp, obj_pos]
else:
# test synonyms and plural/singular
test = [singularize(obj), pluralize(obj)]
if SYNONYMS.get(obj):
test += SYNONYMS.get(obj)
if CLASS_DICT.get(obj):
test += CLASS_DICT.get(obj)
for obj in test:
obj = str(obj) + '_' + str(counter)
if obj in info['encoded_ssps'].keys():
obj_ssp, obj_pos = select_ssp(obj, memory, info['encoded_ssps'], data.ssp_vectors, data.linspace)
result = [obj, obj_ssp, obj_pos]
if result is not None and visualize:
fig, ax = plt.subplots(1,1)
plt.axis('off')
ax.imshow(img)
rect = patches.Rectangle((obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]),
obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2],
linewidth = 2,
label = obj,
edgecolor = 'c',
facecolor = 'none')
ax.add_patch(rect)
plt.show()
return result
def verify_func(data, img, attr, results, info, dim, memory, verbose=0, visualize=False):
"""implements all verify functions dependent on length of attributes given"""
if len(attr) == 2:
# verify_color, verify_shape, verify_scene
num = int(re.findall(r'\d+', attr[0])[0])
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
x, y = obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]
w, h = obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2]
clip_tokens = [f'The {name.split("_")[0]} is {attr[1].strip()}',
f'The {name.split("_")[0]} is not {attr[1].strip()}']
pred = clip_query([x, y, w, h], img, name.split('_')[0], clip_tokens, visualize=visualize)
return True if pred == 0 else False
else:
return False
elif len(attr) == 3:
# verify_rel, verify_rel_inv
obj, rel, rel_obj = attr
num = int(re.findall(r'\d+', obj)[0])
proposals = []
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
proposals = use_query_mask(obj_pos, info, rel, data.linspace, data.ssp_axes, dim, memory, visualize=visualize)
proposals = [str(p).split('_')[0] for p in proposals]
return True if rel_obj.strip() in proposals else False
else:
return False
# verify_f
elif len(attr) == 1:
clip_tokens = [f'The image is {attr[0].strip()}',
f'The image is not {attr[0].strip()}',
f'The image is a {attr[0].strip()}',
f'The image is not a {attr[0].strip()}']
pred = clip_query_scene(img, clip_tokens, verbose=0)
return True if pred == 0 or pred == 2 else False
else:
logging.warning('verify_func not implemented')
return -1
def query_func(func, img, attr, results, info, img_size, dim, verbose=0, visualize=False):
"""implements all query functions"""
if 'query_f(' in func:
attr_type = attr[0].strip()
assert attr_type in CLASS_DICT, f'{attr_type} not found in class dictionary'
attributes = CLASS_DICT.get(attr_type)
clip_tokens = [f'This is a {a} {attr_type}' for a in attributes]
pred = clip_query_scene(img, clip_tokens, verbose=0)
return attributes[pred]
num = int(re.findall(r'\d+', attr[0])[0])
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
x, y = obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]
w, h = obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2]
if len(attr) == 1:
if 'query_n' in func:
# query name
return name.split('_')[0]
if 'query_h' in func:
# query horizontal position --> x-value
if (x + w / 2) >= (img_size[1] / 2):
return 'right'
else:
return 'left'
if 'query_v' in func:
# query vertical position --> y-value
if (y + h / 2) >= (img_size[0] / 2):
return 'bottom'
else:
return 'top'
elif len(attr) == 2:
attr_type = attr[1].strip()
assert attr_type in ATTRIBUTE_DICT, f'{attr_type} not found in attribute dictionary'
attributes = ATTRIBUTE_DICT.get(attr_type)
clip_tokens = [f'The {attr_type} of {name.split("_")[0]} is {a}' for a in attributes]
pred = clip_query([x, y, w, h], img, name.split('_')[0], clip_tokens, visualize=visualize)
return attributes[pred]
else:
return None
logging.warning(f'query not implemented: {func} {attr}')
return -1
def relate_func(data, func, attr, results, info, dim, memory, visualize=False):
"""implements all relationship functions"""
if 'relate_inv_name' in func or 'relate_name' in func:
obj, rel, rel_obj = attr
num = int(re.findall(r'\d+', attr[0])[0])
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
proposals = use_query_mask(obj_pos, info, rel, data.linspace, data.ssp_axes, dim,
memory, verbose=0, visualize=visualize)
selected_obj = proposals[0]
# use rel_obj to filter proposals
rel_obj = rel_obj.strip()
if rel_obj == selected_obj.split('_')[0]:
if visualize:
logging.info('Found perfect match\n')
return selected_obj
elif rel_obj in [str(p).split('_')[0] for p in proposals]:
idx = [str(p).split('_')[0] for p in proposals].index(rel_obj)
return proposals[idx]
elif rel_obj in CLASS_DICT.keys():
class_lst = CLASS_DICT.get(rel_obj)
for p in [str(p).split('_')[0] for p in proposals]:
if p in class_lst or singularize(p) in class_lst:
if visualize:
logging.info(f'Found better proposal for {rel_obj}: {p}\n')
idx = [str(p).split('_')[0] for p in proposals].index(p)
return proposals[idx]
else:
if visualize:
logging.info(f'Did not find {rel_obj} in proposals\n')
return None
elif 'relate_inv' in func or 'relate(' in func:
obj, rel = attr
num = int(re.findall(r'\d+', attr[0])[0])
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
proposals = use_query_mask(obj_pos, info, rel, data.linspace, data.ssp_axes, dim,
memory, visualize=visualize)
return proposals[0]
else:
logging.warning(f'{func} not implemented')
return -1
def filter_func(func, img, attr, img_size, results, info, visualize=False):
"""implements all filter functions"""
obj, filter_attr = attr
num = int(re.findall(r'\d+', attr[0])[0])
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
x, y = obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]
w, h = obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2]
# query height --> y-value
if 'bottom' in filter_attr or 'top' in filter_attr:
if (y + h / 2) >= (img_size[0] / 2):
pred_attr = 'bottom'
else:
pred_attr = 'top'
return pred_attr == filter_attr.strip()
# query side --> x-value
if 'right' in filter_attr or 'left' in filter_attr:
if (x + w / 2) >= (img_size[1] / 2):
pred_attr = 'right'
else:
pred_attr = 'left'
return pred_attr == filter_attr.strip()
# filter by attribute: color, shape, activity, material
else:
clip_tokens = [f'The {name.split("_")[0]} is {filter_attr}',
f'The {name.split("_")[0]} is not {filter_attr}']
pred = clip_query([x, y, w, h], img, name.split('_')[0], clip_tokens, visualize=visualize)
if 'not' in func:
return True if pred == 1 else False
else:
return True if pred == 0 else False
else:
if visualize:
logging.info('No object was found in last step -- nothing to filter')
return None
return -1
def choose_func(data, img, func, attr, img_size, results, info, dim, memory, verbose=0, visualize=False):
"""implements all choose functions"""
if 'choose_f' in func:
pred = clip_query_scene(img, attr, verbose=0)
return attr[pred]
num = int(re.findall(r'\d+', attr[0])[0])
if results[num] is not None:
name, obj_ssp, obj_pos = results[num]
if 'choose_h' in func or 'choose_v' in func:
x, y = obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]
w, h = obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2]
# choose side --> x-value
if 'choose_h' in func:
if (x + w / 2) >= (img_size[1] / 2):
return 'right'
else:
return 'left'
# choose vertical alignment --> y-value
if 'choose_v' in func:
if (y + h / 2) >= (img_size[0] / 2):
return 'bottom'
else:
return 'top'
elif 'choose_n' in func:
obj, name1, name2 = attr
# compare against the plain object name; the parsed attributes still carry leading whitespace
if name.split('_')[0] == name1.strip():
return name1.strip()
elif name.split('_')[0] == name2.strip():
return name2.strip()
else:
return None
elif 'choose_attr' in func:
obj, attr_type, attr1, attr2 = attr
x, y = obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]
w, h = obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2]
clip_tokens = [f'The {attr_type} of {name.split("_")[0]} is {attr1}',
f'The {attr_type} of {name.split("_")[0]} is {attr2}']
pred = clip_query([x, y, w, h], img, name.split('_')[0], clip_tokens, visualize=visualize)
attr_lst = [attr1.strip(), attr2.strip()]
if visualize:
logging.info(f'Choose attribute: {attr_type} of {name} --> clip prediction: {attr_lst[pred]}')
return attr_lst[pred]
elif 'choose_rel_inv' in func:
obj, rel_obj, attr1, attr2 = attr
proposals1 = use_query_mask(obj_pos, info, attr1, data.linspace, data.ssp_axes, dim,
memory, 0, visualize)
proposals1 = [str(p).split('_')[0] for p in proposals1]
proposals2 = use_query_mask(obj_pos, info, attr2, data.linspace, data.ssp_axes, dim,
memory, 0, visualize)
proposals2 = [str(p).split('_')[0] for p in proposals2]
if rel_obj.strip() in proposals1:
return ANSWER_MAP.get(attr1.strip()) if attr1.strip() in ANSWER_MAP else attr1.strip()
elif rel_obj.strip() in proposals2:
return ANSWER_MAP.get(attr2.strip()) if attr2.strip() in ANSWER_MAP else attr2.strip()
else:
return None
elif 'choose_subj' in func:
subj1, subj2, attribute = attr
num1 = int(re.findall(r'\d+', subj1)[0])
num2 = int(re.findall(r'\d+', subj2)[0])
if results[num1] is not None and results[num2] is not None:
name1, obj_ssp1, obj_pos1 = results[num1]
name2, obj_ssp2, obj_pos2 = results[num2]
x1, y1 = obj_pos1[0] * info['scales'][0], obj_pos1[1] * info['scales'][0]
w1, h1 = obj_pos1[2] * info['scales'][1], obj_pos1[3] * info['scales'][2]
x2, y2 = obj_pos2[0] * info['scales'][0], obj_pos2[1] * info['scales'][0]
w2, h2 = obj_pos2[2] * info['scales'][1], obj_pos2[3] * info['scales'][2]
pred = clip_choose([x1, y1, w1, h1], [x2, y2, w2, h2], img, attribute, visualize=visualize)
return name1.split('_')[0] if pred == 0 else name2.split('_')[0]
elif 'choose(' in func:
obj, attr1, attr2 = attr
x, y = obj_pos[0] * info['scales'][0], obj_pos[1] * info['scales'][0]
w, h = obj_pos[2] * info['scales'][1], obj_pos[3] * info['scales'][2]
clip_tokens = [f'The {name.split("_")[0]} is {attr1}',
f'The {name.split("_")[0]} is {attr2}']
pred = clip_query([x, y, w, h], img, name.split('_')[0], clip_tokens, visualize=visualize)
attr_lst = [attr1.strip(), attr2.strip()]
if visualize:
logging.info(f'Choose {attr1} or {attr2} for {name} --> clip prediction: {attr_lst[pred]}')
return attr_lst[pred]
else:
logging.warning(f'{func} not implemented yet')
return -1
else:
if visualize:
logging.info('No object was found in last step -- nothing to choose')
return None
return -1
def run_program(data, img, info, counter, memory, dim, verbose=0):
""" run program for question on given image:
for each step in program select appropriate function
"""
scale, w_scale, h_scale = info['scales']
img_size = img.shape[:2]
results = []
last_step = False
last_func = None
for i, step in enumerate(info['program']):
if i+1 == len(info['program']):
last_step = True
_, func = step.split('=')
attr = func.split('(')[-1].split(')')[0].split(',')
if verbose > 0:
logging.info(f'{i+1}. step: \t {func}')
if 'select' in func:
obj = attr[0].strip()
res = select_func(data, img, obj, info, counter, memory, visualize=VISUALIZE)
results.append(res)
if res is None:
if verbose > 1:
logging.info(f'Could not find {obj}')
elif 'relate' in func:
found_rel_obj = relate_func(data, func, attr, results, info, dim, memory, visualize=VISUALIZE)
if found_rel_obj is not None:
assert found_rel_obj in info['encoded_ssps'], f'Result of {func}: {found_rel_obj} is not encoded'
selected_ssp = info['encoded_ssps'][found_rel_obj]
_, selected_pos = select_ssp(found_rel_obj, memory, info['encoded_ssps'], data.ssp_vectors, data.linspace)
results.append([found_rel_obj, selected_ssp, selected_pos])
if last_step:
return 'yes'
else:
results.append(None)
if last_step:
return 'no'
elif 'filter' in func:
last_filter = filter_func(func, img, attr, img_size, results, info, visualize=VISUALIZE)
if last_filter:
results.append(results[-1])
else:
if results[-1] is None:
results.append(None)
elif results[-1][0].split("_")[0] + "_" + str(counter+1) in info['encoded_ssps'].keys():
counter += 1
return None
else:
last_filter = False
results.append(results[-1])
elif 'verify' in func:
pred = verify_func(data, img, attr, results, info, dim, memory, verbose=verbose, visualize=VISUALIZE)
if 'verify_relation_name' in func or 'verify_relation_inv_name' in func:
results.append(results[-1] if pred else None)
else:
results.append(pred)
if last_step:
return 'yes' if pred else 'no'
elif 'query' in func:
return query_func(func, img, attr, results, info, img_size, dim, verbose=verbose, visualize=VISUALIZE)
elif 'exist' in func:
num = int(re.findall(r'\d+', attr[0])[0])
if last_step:
return 'yes' if results[num] is not None else 'no'
else:
if results[num] is not None and 'filter' not in last_func:
results.append(True)
elif results[num] is not None and last_filter:
results.append(True)
else:
results.append(False)
elif 'or(' in func:
attr1 = int(re.findall(r'\d+', attr[0])[0])
attr2 = int(re.findall(r'\d+', attr[1])[0])
return 'yes' if results[attr1] or results[attr2] else 'no'
elif 'and(' in func:
attr1 = int(re.findall(r'\d+', attr[0])[0])
attr2 = int(re.findall(r'\d+', attr[1])[0])
return 'yes' if results[attr1] and results[attr2] else 'no'
elif 'different' in func:
if len(attr) == 1:
logging.warning(f'{func} cannot be computed')
return None
else:
pred_attr1 = query_func(f'query_{attr[2].strip()}', img, [attr[0], attr[2]], results, info, img_size, dim)
pred_attr2 = query_func(f'query_{attr[2].strip()}', img, [attr[1], attr[2]], results, info, img_size, dim)
if pred_attr1 != pred_attr2:
return 'yes'
else:
return 'no'
elif 'same' in func:
if len(attr) == 1:
logging.warning(f'{func} cannot be computed')
return None
pred_attr1 = query_func(f'query_{attr[2].strip()}', img, [attr[0], attr[2]], results, info, img_size, dim)
pred_attr2 = query_func(f'query_{attr[2].strip()}', img, [attr[1], attr[2]], results, info, img_size, dim)
if pred_attr1 == pred_attr2:
return 'yes'
else:
return 'no'
elif 'choose' in func:
return choose_func(data, img, func, attr, img_size, results, info, dim, memory, visualize=VISUALIZE)
else:
logging.warning(f'{func} not implemented')
return -1
last_func = func
if __name__ == "__main__":
TEST = True
DIM = 2048
RANDOM_SEED = 17
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
x = datetime.now()
TIME_STAMP = x.strftime("%d%b%y-%H%M")
log_file = f"logs/run{TIME_STAMP}-{'VAL' if TEST else 'TRAIN'}{RANDOM_SEED}.log"
log_file = f"logs/run{TIME_STAMP}-DIM{DIM}-{RANDOM_SEED}.log"
logging.basicConfig(level=logging.INFO, filename=log_file,
filemode="w", format="%(asctime)s %(levelname)s %(message)s")
print('Logging to ', log_file)
DATA_PATH = '/scratch/penzkofer/GQA'
CUDA_DEVICE = 7
torch.cuda.set_device(CUDA_DEVICE)
device = torch.device("cuda:" + str(CUDA_DEVICE))
clip_model, preprocess = clip.load("ViT-B/32", device=device)
with open('gqa_all_relations_map.json') as f:
RELATION_DICT = json.load(f)
with open('gqa_all_vocab_classes.json') as f:
CLASS_DICT = json.load(f)
with open('gqa_all_attributes.json') as f:
ATTRIBUTE_DICT = json.load(f)
start = time.time()
res = 100
dim = DIM
new_size = (25, 25) # size should be smaller than resolution!
xs = np.linspace(0, new_size[1], res)
ys = np.linspace(0, new_size[0], res)
ws = np.linspace(1, 10, 10)
hs = np.linspace(1, 10, 10)
rng = np.random.RandomState(seed=RANDOM_SEED)
x_axis = make_good_unitary(dim, rng=rng)
y_axis = make_good_unitary(dim, rng=rng)
w_axis = make_good_unitary(dim, rng=rng)
h_axis = make_good_unitary(dim, rng=rng)
logging.info(f'Size of vector space: {res**2}x{10**2}x{dim}')
logging.info(f'x-axis resolution = {len(xs)}, y-axis resolution = {len(ys)}')
logging.info(f'width resolution = {len(ws)}, height resolution = {len(hs)}')
# precompute the vectors
VECTORS = get_heatmap_vectors_multidim(xs, ys, ws, hs, x_axis, y_axis, w_axis, h_axis)
logging.info(VECTORS.shape)
logging.info(f'Took {time.time() - start} seconds to load vectors.\n')
# load questions, programs and scenegraphs
if TEST:
questions_path = 'val_balanced_questions.json'
programs_path = 'programs/trainval_balanced_programs.json'
scene_path = 'val_sceneGraphs.json'
else:
questions_path = 'train_balanced_questions.json'
programs_path = 'programs/trainval_balanced_programs.json'
scene_path = 'train_sceneGraphs.json'
with open(os.path.join(DATA_PATH, questions_path), 'r') as f:
questions = json.load(f)
with open(os.path.join(DATA_PATH, programs_path), 'r') as f:
programs = json.load(f)
with open(os.path.join(DATA_PATH, scene_path), 'r') as f:
scenegraphs = json.load(f)
columns = ['semantic', 'entailed', 'equivalent', 'question', 'imageId', 'isBalanced', 'groups',
'answer', 'semanticStr', 'annotations', 'types', 'fullAnswer']
questions = pd.DataFrame.from_dict(questions, orient='index', columns=columns)
questions = questions.reset_index()
questions = questions.rename(columns={"index": "questionID"}, errors="raise")
columns = ['imageID', 'question', 'program', 'questionID', 'answer']
programs = pd.DataFrame(programs, columns=columns)
DATA = GQADataset(questions, programs, scenegraphs, vectors=VECTORS,
axes=[x_axis, y_axis, w_axis, h_axis], linspace=[xs, ys, ws, hs])
logging.info(f'Length of data set: {len(DATA)}')
VISUALIZE = False
DATA.set_visualize(VISUALIZE)
DATA.set_verbose(0)
results_lst = []
num_correct = 0
pbar = tqdm(range(len(DATA)), ncols=115)
for i, IDX in enumerate(pbar):
start = time.time()
img, info, memory = DATA.encode_item(IDX, dim=dim)
avg_mse, avg_iou, correct_items = DATA.decode_item(img, info, memory)
try:
answer = run_program(DATA, img, info, counter=1, memory=memory, dim=dim, verbose=1)
except Exception as e:
answer = None
logging.error(e)
if answer == -1:
logging.warning(f'[{IDX}] not fully implemented!')
time_in_sec = time.time() - start
correct = answer == info["answer"]
num_correct += int(correct)
results = {'q_id':info['q_id'], 'question':info['question'],'program':info['program'], 'image':info['img_id'],
'true_answer': info['answer'], 'pred_answer': answer, 'correct': correct, 'time': time_in_sec,
'enc_avg_mse': avg_mse, 'enc_avg_iou': avg_iou, 'enc_correct_items': correct_items,
'enc_items': len(info["encoded_items"]), 'q_idx': IDX}
results_lst.append(results)
logging.info(f'[{IDX+1}] {num_correct / (i+1):.2%}')
pbar.set_postfix({'correct': f'{num_correct / (i+1):.2%}', 'q_idx': str(IDX+1)})
results_df = pd.DataFrame(results_lst)
    logging.info(f'Accuracy: {results_df.correct.sum() / len(results_df):.2%}')
out_path = os.path.join(DATA_PATH, f'results-DIM{DIM}-{"VAL" if TEST else "TRAIN"}{RANDOM_SEED}.pkl')
results_df.to_pickle(out_path)
logging.info(f'Saved results to {out_path}.')

298
utils.py Normal file
View file

@@ -0,0 +1,298 @@
import json
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pattern.text.en import singularize
import nengo.spa as spa
import scipy.integrate as integrate
RGB_COLORS = []
hex_colors = ['#8a3ffc', '#ff7eb6', '#6fdc8c', '#d2a106', '#ba4e00', '#33b1ff', '#570408',
'#fa4d56', '#4589ff', '#08bdba', '#d4bbff', '#007d79', '#d12771', '#bae6ff']
for h in hex_colors:
RGB_COLORS.append(matplotlib.colors.to_rgb(h))
for i, (name, h) in enumerate(matplotlib.colors.cnames.items()):
if i > 10:
RGB_COLORS.append(matplotlib.colors.to_rgb(h))
with open('gqa_all_relations_map.json') as f:
    RELATION_DICT = json.load(f)

with open('gqa_all_vocab_classes.json') as f:
    CLASS_DICT = json.load(f)

with open('gqa_all_attributes.json') as f:
    ATTRIBUTE_DICT = json.load(f)
def bbox_to_mask(x, y, w, h, img_size=(500, 500), name=None, visualize=False):
    """Create a binary mask of size img_size with a w x h rectangle of ones at (x, y)."""
    img = np.zeros(img_size)
    mask_w = np.ones(w)
for j in range(y, y+h):
img[j][x:x+w] = mask_w
if visualize:
fig = plt.figure(figsize=(img_size[0] // 80, img_size[1] // 80))
plt.imshow(img, cmap='gray')
if name:
plt.title(name)
plt.axis('off')
plt.show()
return img
def make_good_unitary(D, eps=1e-3, rng=np.random):
"""from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master"""
a = rng.rand((D - 1) // 2)
sign = rng.choice((-1, +1), len(a))
phi = sign * np.pi * (eps + a * (1 - 2 * eps))
assert np.all(np.abs(phi) >= np.pi * eps)
assert np.all(np.abs(phi) <= np.pi * (1 - eps))
fv = np.zeros(D, dtype='complex64')
fv[0] = 1
fv[1:(D + 1) // 2] = np.cos(phi) + 1j * np.sin(phi)
fv[-1:D // 2:-1] = np.conj(fv[1:(D + 1) // 2])
if D % 2 == 0:
fv[D // 2] = 1
assert np.allclose(np.abs(fv), 1)
v = np.fft.ifft(fv)
# assert np.allclose(v.imag, 0, atol=1e-5)
v = v.real
assert np.allclose(np.fft.fft(v), fv)
assert np.allclose(np.linalg.norm(v), 1)
return spa.SemanticPointer(v)
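
# Illustrative sketch (not part of the original pipeline): a "good unitary"
# axis vector has unit norm, and binding it with itself via circular
# convolution (the * operator of spa.SemanticPointer) keeps the norm at 1,
# which is what makes fractional powers for spatial encoding well-behaved.
#   rng = np.random.RandomState(42)
#   X = make_good_unitary(512, rng=rng)
#   print(np.linalg.norm(X.v))            # ~1.0
#   print(np.linalg.norm((X * X * X).v))  # still ~1.0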
def get_heatmap_vectors(xs, ys, x_axis_sp, y_axis_sp):
"""from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master:
Precompute spatial semantic pointers for every location in the linspace
Used to quickly compute heat maps by a simple vectorized dot product (matrix multiplication)
"""
if x_axis_sp.__class__.__name__ == 'SemanticPointer':
dim = len(x_axis_sp.v)
else:
dim = len(x_axis_sp)
x_axis_sp = spa.SemanticPointer(data=x_axis_sp)
y_axis_sp = spa.SemanticPointer(data=y_axis_sp)
vectors = np.zeros((len(xs), len(ys), dim))
for i, x in enumerate(xs):
for j, y in enumerate(ys):
p = encode_point(
x=x, y=y, x_axis=x_axis_sp, y_axis=y_axis_sp,
)
vectors[i, j, :] = p.v
return vectors
def power(s, e):
"""from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master"""
x = np.fft.ifft(np.fft.fft(s.v) ** e).real
return spa.SemanticPointer(data=x)
def encode_point(x, y, x_axis, y_axis):
"""from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master"""
return power(x_axis, x) * power(y_axis, y)
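
# Sketch of the 2-D encode/decode round trip (small example sizes; the axis
# vectors and linspaces here are illustrative, not the ones used in the pipeline):
#   rng = np.random.RandomState(0)
#   x_axis, y_axis = make_good_unitary(1024, rng=rng), make_good_unitary(1024, rng=rng)
#   xs, ys = np.linspace(0, 10, 11), np.linspace(0, 10, 11)
#   hv = get_heatmap_vectors(xs, ys, x_axis, y_axis)    # shape (11, 11, 1024)
#   p = encode_point(3, 7, x_axis, y_axis)              # X^3 (*) Y^7
#   sim = np.tensordot(p.v, hv, axes=([0], [2]))
#   i, j = np.unravel_index(np.argmax(sim), sim.shape)
#   print(xs[i], ys[j])                                 # close to (3.0, 7.0)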
def encode_region(x, y, x_axis, y_axis):
    """from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master
    NOTE: appears unused here -- scipy.integrate.quad expects a callable,
    so this would need to wrap the binding in a lambda over the integration
    variable before it can run."""
    print(integrate.quad(power(x_axis, x) * power(y_axis, y), x, x+28))
    return integrate.quad(power(x_axis, x) * power(y_axis, y), x, x+28)
def plot_heatmap(img, img_area, encoded_pos, xs, ys, heatmap_vectors, name='', vmin=-1, vmax=1, invert=False):
"""from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master"""
assert encoded_pos.__class__.__name__ == 'SemanticPointer'
# sp has shape (dim,) and heatmap_vectors have shape (xs, ys, dim) so the result will be (xs, ys)
vec_sim = np.tensordot(encoded_pos.v, heatmap_vectors, axes=([0], [2]))
num_plots = 3 if img_area is not None else 2
fig, axs = plt.subplots(1, num_plots, figsize=(4 * num_plots + 3, 3))
fig.suptitle(name)
axs[0].imshow(img)
axs[0].axis('off')
if img_area is not None:
axs[1].imshow(img_area, cmap='gray')
axs[1].set_xticks(np.arange(0, len(xs), 20), np.arange(0, img.shape[1], img.shape[1] / len(xs)).astype(int)[::20])
axs[1].set_yticks(np.arange(0, len(ys), 10), np.arange(0, img.shape[0], img.shape[0] / len(ys)).astype(int)[::10])
axs[1].axis('off')
im = axs[2].imshow(np.transpose(vec_sim), origin='upper', interpolation='none', extent=(xs[-1], xs[0], ys[-1], ys[0]), vmin=vmin, vmax=vmax, cmap='plasma')
axs[2].axis('off')
else:
im = axs[1].imshow(np.transpose(vec_sim), origin='upper', interpolation='none', extent=(xs[-1], xs[0], ys[-1], ys[0]), vmin=vmin, vmax=vmax, cmap='plasma')
axs[1].axis('off')
fig.colorbar(im, ax=axs.ravel().tolist())
plt.show()
def generate_region_vector(desired, xs, ys, x_axis_sp, y_axis_sp):
"""from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master"""
vector = np.zeros_like((x_axis_sp.v))
for i, x in enumerate(xs):
for j, y in enumerate(ys):
if desired[j, i] == 1:
vector += encode_point(x, y, x_axis_sp, y_axis_sp).v
sp = spa.SemanticPointer(data=vector)
sp.normalize()
return sp
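
# Sketch (example values): build a rectangular mask with bbox_to_mask and turn
# it into a single region SSP; x_axis and y_axis are assumed to come from
# make_good_unitary as in the sketch above.
#   desired = bbox_to_mask(2, 3, 4, 5, img_size=(20, 20))
#   region_sp = generate_region_vector(desired, np.arange(20), np.arange(20), x_axis, y_axis)
#   # region_sp can then be compared against encoded object locations via dot products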
def bb_intersection_over_union(boxA, boxB):
"""from https://pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/"""
# determine the (x, y)-coordinates of the intersection rectangle
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[2], boxB[2])
yB = min(boxA[3], boxB[3])
# compute the area of intersection rectangle
interArea = abs(max((xB - xA, 0)) * max((yB - yA), 0))
if interArea == 0:
return 0
# compute the area of both the prediction and ground-truth
# rectangles
boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1]))
boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1]))
# compute the intersection over union by taking the intersection
# area and dividing it by the sum of prediction + ground-truth
# areas - the interesection area
iou = interArea / float(boxAArea + boxBArea - interArea)
# return the intersection over union value
return iou
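
# Quick sanity check (worked example, not from the original code):
#   boxes [0, 0, 10, 10] and [5, 5, 15, 15] overlap in a 5x5 patch,
#   so IoU = 25 / (100 + 100 - 25) = 25 / 175 ≈ 0.143
#   bb_intersection_over_union([0, 0, 10, 10], [5, 5, 15, 15])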
def encode_point_multidim(values, axes):
""" power(x_axis, x) * power(y_axis, y) for variable dimensions """
assert len(values) == len(axes), f'number of values {len(values)} does not match number of axes {len(axes)}'
res = 1
for v, a in zip(values, axes):
res *= power(a, v)
return res
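
# Sketch (example values): a bounding box (x, y, w, h) becomes one SSP by
# multiplying fractional powers of four independent axis vectors.
#   rng = np.random.RandomState(1)
#   axes = [make_good_unitary(512, rng=rng) for _ in range(4)]
#   box_ssp = encode_point_multidim([5, 12, 3, 4], axes)   # X^5 * Y^12 * W^3 * H^4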
def get_heatmap_vectors_multidim(xs, ys, ws, hs, x_axis, y_axis, w_axis, h_axis):
""" adaptation of get_heatmap_vectors for 4 dimensions """
assert x_axis.__class__.__name__ == 'SemanticPointer', f'Axes need to be of type SemanticPointer but are {x_axis.__class__.__name__}'
dim = len(x_axis.v)
vectors = np.zeros((len(xs), len(ys), len(ws), len(hs), dim))
for i, x in enumerate(xs):
for j, y in enumerate(ys):
for n, w in enumerate(ws):
for k, h in enumerate(hs):
p = encode_point_multidim(values=[x, y, w, h], axes=[x_axis, y_axis, w_axis, h_axis])
vectors[i, j, n, k, :] = p.v
return vectors
def ssp_to_loc_multidim(sp, heatmap_vectors, linspace):
""" adaptation of loc_match from https://github.com/ctn-waterloo/cogsci2019-ssp/tree/master
Convert an SSP to the approximate 4-dim location that it represents.
Uses the heatmap vectors as a lookup table
"""
xs, ys, ws, hs = linspace
assert sp.__class__.__name__ == 'SemanticPointer', \
f'Queried object needs to be of type SemanticPointer but is {sp.__class__.__name__}'
# axes: a list of axes to be summed over, first sequence applying to first tensor, second to second tensor
vs = np.tensordot(sp.v, heatmap_vectors, axes=([0], [4]))
res = np.unravel_index(np.argmax(vs, axis=None), vs.shape)
x = xs[res[0]]
y = ys[res[1]]
w = ws[res[2]]
h = hs[res[3]]
return np.array([x, y, w, h])
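
# Sketch of the full store/query/decode round trip (small example sizes and
# names; the actual resolution and dimensionality are configured in the main
# run script):
#   rng = np.random.RandomState(3)
#   axes = [make_good_unitary(512, rng=rng) for _ in range(4)]
#   xs, ys = np.linspace(0, 25, 26), np.linspace(0, 25, 26)
#   ws, hs = np.linspace(1, 10, 10), np.linspace(1, 10, 10)
#   hv = get_heatmap_vectors_multidim(xs, ys, ws, hs, *axes)
#   cat_ssp = spa.SemanticPointer(512)                      # random ID vector for 'cat_1'
#   memory = cat_ssp * encode_point_multidim([4, 8, 2, 3], axes)
#   print(ssp_to_loc_multidim(memory * ~cat_ssp, hv, [xs, ys, ws, hs]))   # ~[4, 8, 2, 3]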
def encode_image_ssp(img, sg_data, axes, new_size, dim, visualize=True):
"""encode all objects in an image to an SSP memory"""
img_size = img.shape[:2]
if img_size[1] / 2 < img_size[0]:
scale = img_size[0] / new_size[0]
else:
scale = img_size[1] / new_size[1]
# scale width and height to fixed size of 10
w_scale = img_size[1] / 10
h_scale = img_size[0] / 10
if visualize:
print(f'Original image {img_size[1]}x{img_size[0]} --> {np.array(img_size) / scale}')
fig, ax = plt.subplots(1,1)
ax.imshow(img, interpolation='none', origin='upper', extent=[0, img_size[1] / scale, img_size[0] / scale, 0])
plt.axis('off')
encoded_items = {}
encoded_ssps = {}
memory = spa.SemanticPointer(data=np.zeros(dim))
name_lst = []
for i, obj in enumerate(sg_data.items()):
id_num, obj_dict = obj
name = obj_dict.get('name')
name = singularize(name)
name_lst.append(name)
name += '_' + str(name_lst.count(name))
# extract ground truth data and scale to fit to SSPs
x, y, width, height = obj_dict.get('x'), obj_dict.get('y'), obj_dict.get('w'), obj_dict.get('h')
x, y, width, height = x / scale, y / scale, width / w_scale, height / h_scale
width = width if width >= 1 else 1
height = height if height >= 1 else 1
# Round values to next int (otherwise decoding gets buggy)
item = np.round([x, y, width, height], decimals=0).astype(int)
encoded_items[name] = item
#print(name, item)
pos = encode_point_multidim(list(item), axes)
ssp = spa.SemanticPointer(dim)
encoded_ssps[name] = ssp
memory += ssp * pos
if visualize:
x, y, width, height = item
width, height = (width * w_scale) / scale, (height * h_scale) / scale
rect = patches.Rectangle((x, y),
width, height,
linewidth = 2,
label = name,
edgecolor = 'c',
facecolor = 'none')
ax.add_patch(rect)
if visualize:
plt.show()
return encoded_items, encoded_ssps, memory
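
# Usage sketch with a hypothetical scene-graph entry (same structure as the
# GQA sceneGraphs objects; img, axes and dim are assumed to be prepared by the caller):
#   sg_data = {'0': {'name': 'cats', 'x': 40, 'y': 60, 'w': 120, 'h': 90}}
#   items, ssps, memory = encode_image_ssp(img, sg_data, axes, new_size=(25, 25),
#                                          dim=512, visualize=False)
#   # items['cat_1'] holds the scaled integer box; unbinding memory * ~ssps['cat_1']
#   # and decoding with ssp_to_loc_multidim recovers it (see the sketch above).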