# VDGR/utils/image_features_reader.py

from typing import List
import csv
import h5py
import numpy as np
import copy
import pickle
import lmdb # install lmdb by "pip install lmdb"
import base64
import pdb
import os


class ImageFeaturesH5Reader(object):
    """
    A reader for pre-extracted image region features stored in an LMDB
    database, combined with per-image scene graphs stored as pickle files.

    Despite the class name (kept so existing callers keep working), no H5
    file is involved. The layout this class reads is:
    ```
    <features_path>                     (LMDB environment)
       |--- b"keys"     -> pickled list of image ids
       +--- <image_id>  -> pickled dict with the entries "image_id",
                           "image_h", "image_w", "num_boxes", "features",
                           "boxes" and "cls_prob" (the last three are
                           base64-encoded float32 buffers)
    <scene_graph_path>/<image_id>.pkl   (pickled iterable of
                                         (edge_index, edge_attribute) pairs)
    ```
    Parameters
    ----------
    features_path : str
        Path to the LMDB database holding the image features.
    scene_graph_path : str
        Directory containing one ``<image_id>.pkl`` scene graph per image.
    in_memory : bool
        If true, every item is decoded once on first access and then served
        from an in-process cache - trade-off between speed and memory.

    Notes
    -----
    Records are deserialized with ``pickle``; only point this reader at
    databases and scene-graph files from a trusted source.
    """

    _FEATURE_SIZE = 2048   # columns of the decoded "features" buffer
    _NUM_CLASSES = 1601    # columns of the decoded "cls_prob" buffer
    _NUM_EDGE_TYPES = 12   # 12 = 11 rels + hub-node rel

    # Names of the per-index cache lists, in the exact order in which
    # __getitem__ returns the corresponding values.
    _CACHE_FIELDS = (
        "features",
        "num_boxes",
        "boxes",
        "boxes_ori",
        "cls_prob",
        "edge_indexes",
        "edge_attributes",
    )

    def __init__(self, features_path: str, scene_graph_path: str, in_memory: bool = False):
        self.features_path = features_path
        self.scene_graph_path = scene_graph_path
        self._in_memory = in_memory

        self.env = lmdb.open(self.features_path, max_readers=1, readonly=True,
                            lock=False, readahead=False, meminit=False)

        with self.env.begin(write=False) as txn:
            self._image_ids = pickle.loads(txn.get('keys'.encode()))

        self._reset_cache()

    def __len__(self):
        """Return the number of image ids currently known to the reader."""
        return len(self._image_ids)

    def __getitem__(self, image_id):
        """
        Return the features and scene graph of one image.

        Parameters
        ----------
        image_id
            Identifier of the image; it is converted with ``str()`` and
            encoded to bytes before being looked up.

        Returns
        -------
        tuple
            ``(features, num_boxes, image_location, image_location_ori,
            cls_prob, edge_indexes, edge_attributes)``. Every per-region
            array carries one extra leading row for the whole image, and
            ``num_boxes`` counts that row as well.

        Raises
        ------
        ValueError
            If the id is not part of the current key list.
        KeyError
            If the id is in the key list but has no record in the database.
        """
        key = str(image_id).encode()
        index = self._image_ids.index(key)

        if not self._in_memory:
            # Read chunk from file everytime if not loaded in memory.
            return self._load_item(key)

        # Load features during first epoch, all not loaded together as it
        # has a slow start.
        if self.features[index] is None:
            # Decode everything first and only then fill the caches, so a
            # failure half-way (e.g. a missing scene graph file) can never
            # leave a partially cached entry behind.
            loaded = self._load_item(key)
            for name, value in zip(self._CACHE_FIELDS, loaded):
                getattr(self, name)[index] = value

        return tuple(getattr(self, name)[index] for name in self._CACHE_FIELDS)

    def keys(self) -> List[bytes]:
        """Return the list of image ids (compared as bytes in lookups)."""
        return self._image_ids

    def set_keys(self, new_ids: List[str]):
        """
        Replace the known image ids by ``new_ids`` (ASCII-encoded to bytes).

        The in-memory caches are addressed by position in the id list, so
        they are cleared here; otherwise entries cached under the old order
        would be returned for the wrong image.
        """
        self._image_ids = [_id.encode('ascii') for _id in new_ids]
        self._reset_cache()

    def _reset_cache(self):
        """Create one empty cache slot per known image id for every field."""
        for name in self._CACHE_FIELDS:
            setattr(self, name, [None] * len(self._image_ids))

    @staticmethod
    def _decode(blob, rows, cols):
        """Decode a base64 float32 buffer into a ``(rows, cols)`` array."""
        return np.frombuffer(base64.b64decode(blob), dtype=np.float32).reshape(rows, cols)

    def _load_item(self, key):
        """Read the record stored under ``key`` and build the output tuple."""
        with self.env.begin(write=False) as txn:
            raw = txn.get(key)
            if raw is None:
                raise KeyError(
                    f"image id {key!r} has no record in {self.features_path!r}"
                )
            # NOTE: pickle can execute arbitrary code; the database must be trusted.
            item = pickle.loads(raw)

        stored_id = item['image_id']
        image_h = int(item['image_h'])
        image_w = int(item['image_w'])
        num_regions = int(item['num_boxes'])

        features = self._decode(item['features'], num_regions, self._FEATURE_SIZE)
        boxes = self._decode(item['boxes'], num_regions, 4)
        cls_prob = self._decode(item['cls_prob'], num_regions, self._NUM_CLASSES)

        # add an extra row at the top for the <IMG> tokens
        g_cls_prob = np.zeros(self._NUM_CLASSES, dtype=np.float32)
        g_cls_prob[0] = 1
        cls_prob = np.concatenate([np.expand_dims(g_cls_prob, axis=0), cls_prob], axis=0)

        # The whole-image feature is the average of all region features.
        g_feat = np.sum(features, axis=0) / num_regions
        features = np.concatenate([np.expand_dims(g_feat, axis=0), features], axis=0)

        # Columns 0-3 hold the box, column 4 the box area as a fraction of
        # the image area (presumably boxes are (x1, y1, x2, y2) - columns
        # 0/2 are scaled by the width and 1/3 by the height below).
        image_location = np.zeros((boxes.shape[0], 5), dtype=np.float32)
        image_location[:, :4] = boxes
        image_location[:, 4] = (
            (image_location[:, 3] - image_location[:, 1])
            * (image_location[:, 2] - image_location[:, 0])
            / (float(image_w) * float(image_h))
        )

        # Snapshot taken before the coordinates are normalized.
        image_location_ori = image_location.copy()
        image_location[:, :4] /= np.array(
            [image_w, image_h, image_w, image_h], dtype=np.float32
        )

        # The integer "global" rows make np.concatenate promote both
        # location arrays to float64; kept as is for compatibility.
        g_location = np.array([0, 0, 1, 1, 1])
        image_location = np.concatenate(
            [np.expand_dims(g_location, axis=0), image_location], axis=0
        )

        # NOTE(review): column 4 of the region rows is the area *fraction*
        # while the global row stores the absolute area image_w * image_h -
        # confirm that this mix is intended.
        g_location_ori = np.array([0, 0, image_w, image_h, image_w * image_h])
        image_location_ori = np.concatenate(
            [np.expand_dims(g_location_ori, axis=0), image_location_ori], axis=0
        )

        edge_indexes, edge_attributes = self._load_scene_graph(stored_id)

        return (
            features,
            num_regions + 1,  # the whole-image row counts as a box
            image_location,
            image_location_ori,
            cls_prob,
            edge_indexes,
            edge_attributes,
        )

    def _load_scene_graph(self, stored_id):
        """Load the scene graph of ``stored_id`` as (edge_indexes, edge_attributes)."""
        pth = os.path.join(self.scene_graph_path, f'{stored_id}.pkl')
        with open(pth, 'rb') as f:
            # NOTE: pickle can execute arbitrary code; the files must be trusted.
            graph_data = pickle.load(f)

        edge_indexes = []
        edge_attributes = []
        for e_idx, e_attr in graph_data:
            edge_indexes.append(e_idx)
            # get one-hot-encoding of the edges
            e_attr_one_hot = np.zeros((self._NUM_EDGE_TYPES,), dtype=np.float32)
            e_attr_one_hot[e_attr] = 1.0
            edge_attributes.append(e_attr_one_hot)

        # One column per edge after the transpose.
        # NOTE(review): indexes are stored as float64 - confirm that callers
        # cast them to an integer type before using them as indices.
        edge_indexes = np.array(edge_indexes, dtype=np.float64).transpose(1, 0)
        edge_attributes = np.stack(edge_attributes, axis=0)
        return edge_indexes, edge_attributes
Code release 2023-10-25 15:38:09 +02:00			`from typing import List`
			`import csv`
			`import h5py`
			`import numpy as np`
			`import copy`
			`import pickle`
			`import lmdb # install lmdb by "pip install lmdb"`
			`import base64`
			`import pdb`
			`import os`


			`class ImageFeaturesH5Reader(object):`
			`"""`
			`A reader for H5 files containing pre-extracted image features. A typical`
			`H5 file is expected to have a column named "image_id", and another column`
			`named "features".`

			`Example of an H5 file:`
			```
			`faster_rcnn_bottomup_features.h5`
			`\|--- "image_id" [shape: (num_images, )]`
			`\|--- "features" [shape: (num_images, num_proposals, feature_size)]`
			`+--- .attrs ("split", "train")`
			```
			`Parameters`
			`----------`
			`features_h5path : str`
			`Path to an H5 file containing COCO train / val image features.`
			`in_memory : bool`
			`Whether to load the whole H5 file in memory. Beware, these files are`
			`sometimes tens of GBs in size. Set this to true if you have sufficient`
			`RAM - trade-off between speed and memory.`
			`"""`
			`def __init__(self, features_path: str, scene_graph_path: str, in_memory: bool = False):`
			`self.features_path = features_path`
			`self.scene_graph_path = scene_graph_path`
			`self._in_memory = in_memory`

			`self.env = lmdb.open(self.features_path, max_readers=1, readonly=True,`
			`lock=False, readahead=False, meminit=False)`

			`with self.env.begin(write=False) as txn:`
			`self._image_ids = pickle.loads(txn.get('keys'.encode()))`

			`self.features = [None] * len(self._image_ids)`
			`self.num_boxes = [None] * len(self._image_ids)`
			`self.boxes = [None] * len(self._image_ids)`
			`self.boxes_ori = [None] * len(self._image_ids)`
			`self.cls_prob = [None] * len(self._image_ids)`
			`self.edge_indexes = [None] * len(self._image_ids)`
			`self.edge_attributes = [None] * len(self._image_ids)`

			`def __len__(self):`
			`return len(self._image_ids)`

			`def __getitem__(self, image_id):`

			`image_id = str(image_id).encode()`
			`index = self._image_ids.index(image_id)`
			`if self._in_memory:`
			`# Load features during first epoch, all not loaded together as it`
			`# has a slow start.`
			`if self.features[index] is not None:`
			`features = self.features[index]`
			`num_boxes = self.num_boxes[index]`
			`image_location = self.boxes[index]`
			`image_location_ori = self.boxes_ori[index]`
			`cls_prob = self.cls_prob[index]`
			`edge_indexes = self.edge_indexes[index]`
			`edge_attributes = self.edge_attributes[index]`
			`else:`
			`with self.env.begin(write=False) as txn:`
			`item = pickle.loads(txn.get(image_id))`
			`image_id = item['image_id']`
			`image_h = int(item['image_h'])`
			`image_w = int(item['image_w'])`
			`num_boxes = int(item['num_boxes'])`
			`features = np.frombuffer(base64.b64decode(item["features"]), dtype=np.float32).reshape(num_boxes, 2048)`
			`boxes = np.frombuffer(base64.b64decode(item['boxes']), dtype=np.float32).reshape(num_boxes, 4)`

			`cls_prob = np.frombuffer(base64.b64decode(item['cls_prob']), dtype=np.float32).reshape(num_boxes, 1601)`
			`# add an extra row at the top for the <IMG> tokens`
			`g_cls_prob = np.zeros(1601, dtype=np.float32)`
			`g_cls_prob[0] = 1`
			`cls_prob = np.concatenate([np.expand_dims(g_cls_prob,axis=0), cls_prob], axis=0)`

			`self.cls_prob[index] = cls_prob`

			`g_feat = np.sum(features, axis=0) / num_boxes`
			`num_boxes = num_boxes + 1`

			`features = np.concatenate([np.expand_dims(g_feat, axis=0), features], axis=0)`
			`self.features[index] = features`

			`image_location = np.zeros((boxes.shape[0], 5), dtype=np.float32)`
			`image_location[:,:4] = boxes`
			`image_location[:,4] = (image_location[:,3] - image_location[:,1]) * (image_location[:,2] - image_location[:,0]) / (float(image_w) * float(image_h))`

			`image_location_ori = copy.deepcopy(image_location)`

			`image_location[:,0] = image_location[:,0] / float(image_w)`
			`image_location[:,1] = image_location[:,1] / float(image_h)`
			`image_location[:,2] = image_location[:,2] / float(image_w)`
			`image_location[:,3] = image_location[:,3] / float(image_h)`

			`g_location = np.array([0,0,1,1,1])`
			`image_location = np.concatenate([np.expand_dims(g_location, axis=0), image_location], axis=0)`
			`self.boxes[index] = image_location`

			`g_location_ori = np.array([0, 0, image_w, image_h, image_w*image_h])`
			`image_location_ori = np.concatenate([np.expand_dims(g_location_ori, axis=0), image_location_ori], axis=0)`
			`self.boxes_ori[index] = image_location_ori`
			`self.num_boxes[index] = num_boxes`

			`# load the scene graph data`
			`pth = os.path.join(self.scene_graph_path, f'{image_id}.pkl')`
			`with open(pth, 'rb') as f:`
			`graph_data = pickle.load(f)`
			`edge_indexes = []`
			`edge_attributes = []`
			`for e_idx, e_attr in graph_data:`
			`edge_indexes.append(e_idx)`
			`# get one-hot-encoding of the edges`
			`e_attr_one_hot = np.zeros((12,), dtype=np.float32) # 12 = 11 rels + hub-node rel`
			`e_attr_one_hot[e_attr] = 1.0`
			`edge_attributes.append(e_attr_one_hot)`
			`edge_indexes = np.array(edge_indexes, dtype=np.float64).transpose(1, 0)`
			`edge_attributes = np.stack(edge_attributes, axis=0)`

			`self.edge_indexes[index] = edge_indexes`
			`self.edge_attributes[index] = edge_attributes`

			`else:`
			`# Read chunk from file everytime if not loaded in memory.`
			`with self.env.begin(write=False) as txn:`
			`item = pickle.loads(txn.get(image_id))`
			`image_id = item['image_id']`
			`image_h = int(item['image_h'])`
			`image_w = int(item['image_w'])`
			`num_boxes = int(item['num_boxes'])`
			`cls_prob = np.frombuffer(base64.b64decode(item['cls_prob']), dtype=np.float32).reshape(num_boxes, 1601)`
			`# add an extra row at the top for the <IMG> tokens`
			`g_cls_prob = np.zeros(1601, dtype=np.float32)`
			`g_cls_prob[0] = 1`
			`cls_prob = np.concatenate([np.expand_dims(g_cls_prob,axis=0), cls_prob], axis=0)`

			`features = np.frombuffer(base64.b64decode(item["features"]), dtype=np.float32).reshape(num_boxes, 2048)`
			`boxes = np.frombuffer(base64.b64decode(item['boxes']), dtype=np.float32).reshape(num_boxes, 4)`
			`g_feat = np.sum(features, axis=0) / num_boxes`
			`num_boxes = num_boxes + 1`
			`features = np.concatenate([np.expand_dims(g_feat, axis=0), features], axis=0)`

			`image_location = np.zeros((boxes.shape[0], 5), dtype=np.float32)`
			`image_location[:,:4] = boxes`
			`image_location[:,4] = (image_location[:,3] - image_location[:,1]) * (image_location[:,2] - image_location[:,0]) / (float(image_w) * float(image_h))`

			`image_location_ori = copy.deepcopy(image_location)`
			`image_location[:,0] = image_location[:,0] / float(image_w)`
			`image_location[:,1] = image_location[:,1] / float(image_h)`
			`image_location[:,2] = image_location[:,2] / float(image_w)`
			`image_location[:,3] = image_location[:,3] / float(image_h)`

			`g_location = np.array([0,0,1,1,1])`
			`image_location = np.concatenate([np.expand_dims(g_location, axis=0), image_location], axis=0)`

			`g_location_ori = np.array([0,0,image_w,image_h,image_w*image_h])`
			`image_location_ori = np.concatenate([np.expand_dims(g_location_ori, axis=0), image_location_ori], axis=0)`

			`# load the scene graph data`
			`pth = os.path.join(self.scene_graph_path, f'{image_id}.pkl')`
			`with open(pth, 'rb') as f:`
			`graph_data = pickle.load(f)`
			`edge_indexes = []`
			`edge_attributes = []`
			`for e_idx, e_attr in graph_data:`
			`edge_indexes.append(e_idx)`
			`# get one-hot-encoding of the edges`
			`e_attr_one_hot = np.zeros((12,), dtype=np.float32) # 12 = 11 rels + hub-node rel`
			`e_attr_one_hot[e_attr] = 1.0`
			`edge_attributes.append(e_attr_one_hot)`
			`edge_indexes = np.array(edge_indexes, dtype=np.float64).transpose(1, 0)`
			`edge_attributes = np.stack(edge_attributes, axis=0)`

			`return features, num_boxes, image_location, image_location_ori, cls_prob, edge_indexes, edge_attributes`


			`def keys(self) -> List[int]:`
			`return self._image_ids`

			`def set_keys(self, new_ids: List[str]):`
			`self._image_ids = list(map(lambda _id: _id.encode('ascii') ,new_ids))`