193 lines
9.3 KiB
Python
193 lines
9.3 KiB
Python
|
from typing import List
|
||
|
import csv
|
||
|
import h5py
|
||
|
import numpy as np
|
||
|
import copy
|
||
|
import pickle
|
||
|
import lmdb # install lmdb by "pip install lmdb"
|
||
|
import base64
|
||
|
import pdb
|
||
|
import os
|
||
|
|
||
|
|
||
|
class ImageFeaturesH5Reader(object):
    """
    A reader for pre-extracted image region features stored in an LMDB
    database, combined with per-image scene-graph pickle files.

    (The class name is historical; no H5 file is read by this class.)

    Layout this reader expects, as used by the code below:

    ```
    <features_path>                      (LMDB environment)
       |--- b"keys"      -> pickled list of image-id keys (byte strings)
       +--- <image key>  -> pickled dict with entries
                "image_id", "image_h", "image_w", "num_boxes",
                "features"  base64 float32 buffer, (num_boxes, 2048)
                "boxes"     base64 float32 buffer, (num_boxes, 4)
                "cls_prob"  base64 float32 buffer, (num_boxes, 1601)

    <scene_graph_path>/<image_id>.pkl    pickled iterable of
                                         (edge_index, edge_attribute) pairs
    ```

    Indexing the reader with an image id returns the tuple
    ``(features, num_boxes, image_location, image_location_ori, cls_prob,
    edge_indexes, edge_attributes)``. One extra "global" row is prepended to
    the per-box arrays, so the returned ``num_boxes`` is the stored box count
    plus one.

    NOTE: both the LMDB values and the scene-graph files are decoded with
    ``pickle``; only point this reader at trusted data.

    Parameters
    ----------
    features_path : str
        Path to the LMDB environment holding the image features.
    scene_graph_path : str
        Directory containing one ``<image_id>.pkl`` scene-graph file per image.
    in_memory : bool
        Whether to keep every decoded sample in memory after its first access.
        Samples are cached lazily (on first use) rather than all at start-up.
        Trade-off between speed and memory.
    """

    # Fixed sizes of the stored / produced arrays.
    _FEATURE_SIZE = 2048
    _NUM_CLASSES = 1601
    _NUM_EDGE_TYPES = 12  # 12 = 11 rels + hub-node rel

    def __init__(self, features_path: str, scene_graph_path: str, in_memory: bool = False):
        self.features_path = features_path
        self.scene_graph_path = scene_graph_path
        self._in_memory = in_memory

        self.env = lmdb.open(self.features_path, max_readers=1, readonly=True,
                             lock=False, readahead=False, meminit=False)

        with self.env.begin(write=False) as txn:
            self._image_ids = pickle.loads(txn.get('keys'.encode()))

        self._reset_index_and_cache()

    def _reset_index_and_cache(self):
        """Rebuild the key -> position map and empty all per-image caches.

        The caches are addressed by position in ``self._image_ids``, so they
        must be discarded whenever that list is replaced; otherwise a cached
        entry could be returned for a different image.
        """
        index_by_id = {}
        for position, key in enumerate(self._image_ids):
            # setdefault keeps the first occurrence, like list.index does.
            index_by_id.setdefault(key, position)
        self._index_by_id = index_by_id

        size = len(self._image_ids)
        self.features = [None] * size
        self.num_boxes = [None] * size
        self.boxes = [None] * size
        self.boxes_ori = [None] * size
        self.cls_prob = [None] * size
        self.edge_indexes = [None] * size
        self.edge_attributes = [None] * size

    def __len__(self):
        return len(self._image_ids)

    def __getitem__(self, image_id):
        """Return the decoded sample for ``image_id``.

        Parameters
        ----------
        image_id
            Image identifier; it is converted with ``str(...).encode()`` to
            obtain the LMDB key.

        Returns
        -------
        tuple
            ``(features, num_boxes, image_location, image_location_ori,
            cls_prob, edge_indexes, edge_attributes)``.

        Raises
        ------
        ValueError
            If the key is not among the current keys (see ``keys``).
        """
        key = str(image_id).encode()

        index = self._index_by_id.get(key)
        if index is None:
            # Fall back to the list itself; this raises ValueError for an
            # unknown key, exactly as a plain list lookup would.
            index = self._image_ids.index(key)

        if self._in_memory and self.features[index] is not None:
            return (
                self.features[index],
                self.num_boxes[index],
                self.boxes[index],
                self.boxes_ori[index],
                self.cls_prob[index],
                self.edge_indexes[index],
                self.edge_attributes[index],
            )

        sample = self._load_sample(key)

        if self._in_memory:
            # Fill the cache only after the whole sample has been decoded, so
            # a failure part-way through never leaves a half-populated entry.
            (self.features[index],
             self.num_boxes[index],
             self.boxes[index],
             self.boxes_ori[index],
             self.cls_prob[index],
             self.edge_indexes[index],
             self.edge_attributes[index]) = sample

        return sample

    @staticmethod
    def _decode_array(encoded, rows, cols):
        """Decode a base64-encoded float32 buffer into a (rows, cols) array."""
        return np.frombuffer(base64.b64decode(encoded), dtype=np.float32).reshape(rows, cols)

    def _load_sample(self, key):
        """Read one image record from LMDB plus its scene graph and decode it."""
        with self.env.begin(write=False) as txn:
            # NOTE: pickle must only be used on trusted data.
            item = pickle.loads(txn.get(key))

        stored_image_id = item['image_id']
        image_h = int(item['image_h'])
        image_w = int(item['image_w'])
        num_boxes = int(item['num_boxes'])

        features = self._decode_array(item["features"], num_boxes, self._FEATURE_SIZE)
        boxes = self._decode_array(item['boxes'], num_boxes, 4)
        cls_prob = self._decode_array(item['cls_prob'], num_boxes, self._NUM_CLASSES)

        # add an extra row at the top for the <IMG> tokens
        g_cls_prob = np.zeros(self._NUM_CLASSES, dtype=np.float32)
        g_cls_prob[0] = 1
        cls_prob = np.concatenate([np.expand_dims(g_cls_prob, axis=0), cls_prob], axis=0)

        # The global feature row is the mean of the per-box features.
        g_feat = np.sum(features, axis=0) / num_boxes
        num_boxes = num_boxes + 1
        features = np.concatenate([np.expand_dims(g_feat, axis=0), features], axis=0)

        image_location, image_location_ori = self._build_locations(boxes, image_w, image_h)
        edge_indexes, edge_attributes = self._load_scene_graph(stored_image_id)

        return (features, num_boxes, image_location, image_location_ori,
                cls_prob, edge_indexes, edge_attributes)

    @staticmethod
    def _build_locations(boxes, image_w, image_h):
        """Build the normalised and the original 5-column box arrays.

        Columns 0-3 are the four box values as stored; column 4 is the box
        area divided by the image area. In the normalised array columns 0 and
        2 are divided by the image width and columns 1 and 3 by the image
        height. A global row is prepended to both arrays.
        """
        image_location = np.zeros((boxes.shape[0], 5), dtype=np.float32)
        image_location[:, :4] = boxes
        image_location[:, 4] = (
            (image_location[:, 3] - image_location[:, 1])
            * (image_location[:, 2] - image_location[:, 0])
            / (float(image_w) * float(image_h))
        )

        # Snapshot before normalising: this copy keeps the stored coordinates.
        image_location_ori = image_location.copy()

        image_location[:, 0] = image_location[:, 0] / float(image_w)
        image_location[:, 1] = image_location[:, 1] / float(image_h)
        image_location[:, 2] = image_location[:, 2] / float(image_w)
        image_location[:, 3] = image_location[:, 3] / float(image_h)

        # The global rows are integer arrays, so NumPy promotes the
        # concatenated results to float64. Kept as-is for compatibility.
        g_location = np.array([0, 0, 1, 1, 1])
        image_location = np.concatenate(
            [np.expand_dims(g_location, axis=0), image_location], axis=0)

        g_location_ori = np.array([0, 0, image_w, image_h, image_w * image_h])
        image_location_ori = np.concatenate(
            [np.expand_dims(g_location_ori, axis=0), image_location_ori], axis=0)

        return image_location, image_location_ori

    def _load_scene_graph(self, stored_image_id):
        """Load ``<scene_graph_path>/<image_id>.pkl`` and encode its edges.

        Returns the edge indexes as a float64 array transposed so that edges
        run along the last axis, and the edge attributes as one-hot float32
        rows of width 12.
        """
        pth = os.path.join(self.scene_graph_path, f'{stored_image_id}.pkl')
        with open(pth, 'rb') as f:
            # NOTE: pickle must only be used on trusted data.
            graph_data = pickle.load(f)

        edge_indexes = []
        edge_attributes = []
        for e_idx, e_attr in graph_data:
            edge_indexes.append(e_idx)
            # get one-hot-encoding of the edges
            e_attr_one_hot = np.zeros((self._NUM_EDGE_TYPES,), dtype=np.float32)
            e_attr_one_hot[e_attr] = 1.0
            edge_attributes.append(e_attr_one_hot)

        edge_indexes = np.array(edge_indexes, dtype=np.float64).transpose(1, 0)
        edge_attributes = np.stack(edge_attributes, axis=0)
        return edge_indexes, edge_attributes

    def keys(self) -> List[bytes]:
        """Return the current list of image-id keys (encoded byte strings)."""
        return self._image_ids

    def set_keys(self, new_ids: List[str]):
        """Replace the key list with ``new_ids`` (ASCII-encoded).

        All per-image caches are cleared, because they are addressed by
        position in the key list and would otherwise refer to other images.
        """
        self._image_ids = [_id.encode('ascii') for _id in new_ids]
        self._reset_index_and_cache()
|