# HOIGaze/adt_processing/adt_preprocessing.py

import numpy as np
import os
os.nice(5)
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import plotly.graph_objects as go
import math
from math import tan
import random
from scipy.linalg import pinv
import projectaria_tools.core.mps as mps
import shutil
import json
from PIL import Image
from utils import remake_dir
import pandas as pd
import pylab as p
from IPython.display import display
import time
from projectaria_tools import utils
from projectaria_tools.core.stream_id import StreamId
from projectaria_tools.core import calibration
from projectaria_tools.projects.adt import (
    AriaDigitalTwinDataProvider,
    AriaDigitalTwinSkeletonProvider,
    AriaDigitalTwinDataPathsProvider,
    bbox3d_to_line_coordinates,
    bbox2d_to_image_coordinates,
    utils as adt_utils,
    Aria3dPose
)
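
# Configuration: raw ADT sequences are read from dataset_path, and the processed
# .npy files are written to dataset_processed_path, split into train/ and test/
# according to the 'training' column of adt.csv.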
dataset_path = '/datasets/public/zhiming_datasets/adt/'
dataset_processed_path = '/scratch/hu/pose_forecast/adt_hoigaze/'
remake_dir(dataset_processed_path)
remake_dir(dataset_processed_path + "train/")
remake_dir(dataset_processed_path + "test/")
dataset_info = pd.read_csv('adt.csv')
object_num = 5 # number of extracted dynamic objects that are closest to the left or right hands
for i, seq in enumerate(dataset_info['sequence_name']):
    action = dataset_info['action'][i]
    print("\nprocessing {}th seq: {}, action: {}...".format(i+1, seq, action))
    seq_path = dataset_path + seq + '/'
    if dataset_info['training'][i] == 1:
        save_path = dataset_processed_path + 'train/' + seq + '_'
    elif dataset_info['training'][i] == 0:
        save_path = dataset_processed_path + 'test/' + seq + '_'
    paths_provider = AriaDigitalTwinDataPathsProvider(seq_path)
    all_device_serials = paths_provider.get_device_serial_numbers()
    selected_device_number = 0
    data_paths = paths_provider.get_datapaths_by_device_num(selected_device_number)
    print("loading ground truth data...")
    gt_provider = AriaDigitalTwinDataProvider(data_paths)
    print("loading ground truth data done")
    stream_id = StreamId("214-1")
    img_timestamps_ns = gt_provider.get_aria_device_capture_timestamps_ns(stream_id)
    frame_num = len(img_timestamps_ns)
    print("There are {} frames".format(frame_num))
    # get all available skeletons in a sequence
    skeleton_ids = gt_provider.get_skeleton_ids()
    skeleton_info = gt_provider.get_instance_info_by_id(skeleton_ids[0])
    print("skeleton ", skeleton_info.name, " wears ", skeleton_info.associated_device_serial)
    useful_frames = []
    gaze_data = np.zeros((frame_num, 6)) # gaze_direction (3) + gaze_2d (2) + frame_id (1)
    head_data = np.zeros((frame_num, 6)) # head_direction (3) + head_translation (3)
    hand_data = np.zeros((frame_num, 6)) # left_hand_translation (3) + right_hand_translation (3)
    hand_joint_data = np.zeros((frame_num, 92)) # left_hand (15*3) + right_hand (15*3) + attended_hand_gt + attended_hand_baseline (closest_hand)
    object_all_data = []
    object_bbx_all_data = []
    object_center_all_data = []
    local_time = time.asctime(time.localtime(time.time()))
    print('\nProcessing starts at ' + local_time)
    for j in range(frame_num):
        timestamps_ns = img_timestamps_ns[j]
        skeleton_with_dt = gt_provider.get_skeleton_by_timestamp_ns(timestamps_ns, skeleton_ids[0])
        assert skeleton_with_dt.is_valid(), "skeleton is not valid"
        skeleton = skeleton_with_dt.data()
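        # ADT skeleton joint indices used here: joint 4 gives the head position,
        # joints 8 and 27 the left/right hand positions, and the two 15-joint
        # blocks the left/right hand joints.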
        head_translation_id = [4]
        hand_translation_id = [8, 27]
        hand_joints_id = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
        hand_translation = np.array(skeleton.joints)[hand_translation_id, :].reshape(2*3)
        head_translation = np.array(skeleton.joints)[head_translation_id, :].reshape(1*3)
        hand_joints = np.array(skeleton.joints)[hand_joints_id, :].reshape(30*3)
        hand_data[j] = hand_translation
        hand_joint_data[j, :90] = hand_joints
        left_hand_joints = hand_joints[:45].reshape(15, 3)
        left_hand_center = np.mean(left_hand_joints, axis=0)
        right_hand_joints = hand_joints[45:].reshape(15, 3)
        right_hand_center = np.mean(right_hand_joints, axis=0)
        # get the Aria pose
        aria3dpose_with_dt = gt_provider.get_aria_3d_pose_by_timestamp_ns(timestamps_ns)
        if not aria3dpose_with_dt.is_valid():
            print("aria 3d pose is not available")
        aria3dpose = aria3dpose_with_dt.data()
        transform_scene_device = aria3dpose.transform_scene_device.matrix()
        # get projection function
        cam_calibration = gt_provider.get_aria_camera_calibration(stream_id)
        assert cam_calibration is not None, "no camera calibration"
        eye_gaze_with_dt = gt_provider.get_eyegaze_by_timestamp_ns(timestamps_ns)
        assert eye_gaze_with_dt.is_valid(), "Eye gaze not available"
        # Project the gaze center in the CPF frame into the camera sensor plane, with multiplication performed in homogeneous coordinates
        eye_gaze = eye_gaze_with_dt.data()
        gaze_center_in_cpf = np.array([tan(eye_gaze.yaw), tan(eye_gaze.pitch), 1.0], dtype=np.float64) * eye_gaze.depth
        head_center_in_cpf = np.array([0.0, 0.0, 1.0], dtype=np.float64)
        transform_cpf_sensor = gt_provider.raw_data_provider_ptr().get_device_calibration().get_transform_cpf_sensor(cam_calibration.get_label())
        gaze_center_in_camera = transform_cpf_sensor.inverse().matrix() @ np.hstack((gaze_center_in_cpf, 1)).T
        gaze_center_in_camera = gaze_center_in_camera[:3] / gaze_center_in_camera[3:]
        gaze_center_in_pixels = cam_calibration.project(gaze_center_in_camera)
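        # Map the head direction from CPF into the camera frame, then lift both the
        # gaze point and the head direction into device and scene coordinates: the
        # gaze point is transformed as a point (homogeneous coordinate 1), the head
        # direction as a direction (homogeneous coordinate 0).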
        head_center_in_camera = transform_cpf_sensor.inverse().matrix() @ np.hstack((head_center_in_cpf, 0)).T
        head_center_in_camera = head_center_in_camera[:3]
        extrinsic_matrix = cam_calibration.get_transform_device_camera().matrix()
        gaze_center_in_device = (extrinsic_matrix @ np.hstack((gaze_center_in_camera, 1)))[0:3]
        gaze_center_in_scene = (transform_scene_device @ np.hstack((gaze_center_in_device, 1)))[0:3]
        head_center_in_device = (extrinsic_matrix @ np.hstack((head_center_in_camera, 0)))[0:3]
        head_center_in_scene = (transform_scene_device @ np.hstack((head_center_in_device, 0)))[0:3]
        gaze_direction = gaze_center_in_scene - head_translation
        if np.linalg.norm(gaze_direction) == 0: # invalid data that will be filtered
            gaze_direction = np.array([0.0, 0.0, 1.0], dtype=np.float64)
        else:
            gaze_direction = gaze_direction / np.linalg.norm(gaze_direction)
        head_direction = head_center_in_scene / np.linalg.norm(head_center_in_scene)
        head_data[j, 0:3] = head_direction
        head_data[j, 3:6] = head_translation
        left_hand_direction = left_hand_center - head_translation
        left_hand_direction = left_hand_direction / np.linalg.norm(left_hand_direction)
        left_hand_distance_to_gaze = np.arccos(np.sum(gaze_direction*left_hand_direction))
        right_hand_direction = right_hand_center - head_translation
        right_hand_direction = right_hand_direction / np.linalg.norm(right_hand_direction)
        right_hand_distance_to_gaze = np.arccos(np.sum(gaze_direction*right_hand_direction))
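        # ground-truth attended hand: 0 = left, 1 = right, whichever hand lies at a
        # smaller angular distance from the gaze direction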
        if left_hand_distance_to_gaze < right_hand_distance_to_gaze:
            hand_joint_data[j, 90:91] = 0
        else:
            hand_joint_data[j, 90:91] = 1
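        # keep the frame only if the gaze point projects into the RGB image; the
        # projected pixel coordinates are swapped before being normalised by the
        # image size to obtain the 2D gaze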
        if gaze_center_in_pixels is not None:
            x_pixel = gaze_center_in_pixels[1]
            y_pixel = gaze_center_in_pixels[0]
            gaze_center_in_pixels[0] = x_pixel
            gaze_center_in_pixels[1] = y_pixel
            useful_frames.append(j)
            gaze_2d = np.divide(gaze_center_in_pixels, cam_calibration.get_image_size())
            gaze_data[j, 0:3] = gaze_direction
            gaze_data[j, 3:5] = gaze_2d
            gaze_data[j, 5:6] = j
        # get the objects
        bbox3d_with_dt = gt_provider.get_object_3d_boundingboxes_by_timestamp_ns(timestamps_ns)
        assert bbox3d_with_dt.is_valid(), "3D bounding box is not available"
        bbox3d_all = bbox3d_with_dt.data()
        object_all = []
        object_bbx_all = []
        object_center_all = []
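        # transform each object's axis-aligned box corners into scene coordinates and
        # keep only DYNAMIC objects: the full 16-point line strip, the 8 box corners,
        # and the box centre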
        for obj_id in bbox3d_all:
            bbox3d = bbox3d_all[obj_id]
            aabb = bbox3d.aabb
            aabb_coords = bbox3d_to_line_coordinates(aabb)
            obb = np.zeros(shape=(len(aabb_coords), 3))
            for k in range(0, len(aabb_coords)):
                aabb_pt = aabb_coords[k]
                aabb_pt_homo = np.append(aabb_pt, [1])
                obb_pt = (bbox3d.transform_scene_object.matrix() @ aabb_pt_homo)[0:3]
                obb[k] = obb_pt
            motion_type = gt_provider.get_instance_info_by_id(obj_id).motion_type
            if str(motion_type) == 'MotionType.DYNAMIC':
                object_all.append(obb)
                bbx_idx = [0, 1, 2, 3, 5, 6, 7, 8]
                obb_bbx = obb[bbx_idx, :]
                object_bbx_all.append(obb_bbx)
                obb_center = np.mean(obb_bbx, axis=0)
                object_center_all.append(obb_center)
        object_all_data.append(object_all)
        object_bbx_all_data.append(object_bbx_all)
        object_center_all_data.append(object_center_all)
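    # drop frames without a valid projected gaze point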
    gaze_data = gaze_data[useful_frames, :] # useful_frames are actually continuous
    head_data = head_data[useful_frames, :]
    hand_data = hand_data[useful_frames, :]
    hand_joint_data = hand_joint_data[useful_frames, :]
    object_all_data = np.array(object_all_data)
    object_all_data = object_all_data[useful_frames, :, :, :]
    #print("Objects shape: {}".format(object_all_data.shape))
    object_bbx_all_data = np.array(object_bbx_all_data)
    object_bbx_all_data = object_bbx_all_data[useful_frames, :, :, :]
    object_center_all_data = np.array(object_center_all_data)
    object_center_all_data = object_center_all_data[useful_frames, :, :]
    # extract the closest objects to the left or right hands
    useful_frames_num = len(useful_frames)
    print("There are {} useful frames".format(useful_frames_num))
    object_num_all = object_all_data.shape[1]
    object_left_hand_data = np.zeros((useful_frames_num, object_num, 16, 3))
    object_bbx_left_hand_data = np.zeros((useful_frames_num, object_num, 8, 3))
    object_distance_to_left_hand = np.zeros((useful_frames_num, object_num_all))
    object_right_hand_data = np.zeros((useful_frames_num, object_num, 16, 3))
    object_bbx_right_hand_data = np.zeros((useful_frames_num, object_num, 8, 3))
    object_distance_to_right_hand = np.zeros((useful_frames_num, object_num_all))
    for j in range(useful_frames_num):
        left_hand_joints = hand_joint_data[j, :45].reshape(15, 3)
        right_hand_joints = hand_joint_data[j, 45:90].reshape(15, 3)
        for k in range(object_num_all):
            object_pos = object_center_all_data[j, k, :]
            object_distance_to_left_hand[j, k] = np.mean(np.linalg.norm(left_hand_joints-object_pos, axis=1))
            object_distance_to_right_hand[j, k] = np.mean(np.linalg.norm(right_hand_joints-object_pos, axis=1))
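    # baseline attended hand (closest_hand): 0 = left, 1 = right, whichever hand is
    # closer to its nearest dynamic object; also keep the object_num closest objects
    # per hand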
    for j in range(useful_frames_num):
        distance_to_left_hand = object_distance_to_left_hand[j, :]
        distance_to_left_hand_min = np.min(distance_to_left_hand)
        distance_to_right_hand = object_distance_to_right_hand[j, :]
        distance_to_right_hand_min = np.min(distance_to_right_hand)
        if distance_to_left_hand_min < distance_to_right_hand_min:
            hand_joint_data[j, 91:92] = 0
        else:
            hand_joint_data[j, 91:92] = 1
        left_hand_index = np.argsort(distance_to_left_hand)
        right_hand_index = np.argsort(distance_to_right_hand)
        for k in range(object_num):
            object_left_hand_data[j, k] = object_all_data[j, left_hand_index[k]]
            object_bbx_left_hand_data[j, k] = object_bbx_all_data[j, left_hand_index[k]]
            object_right_hand_data[j, k] = object_all_data[j, right_hand_index[k]]
            object_bbx_right_hand_data[j, k] = object_bbx_all_data[j, right_hand_index[k]]
    gaze_path = save_path + 'gaze.npy'
    head_path = save_path + 'head.npy'
    hand_path = save_path + 'hand.npy'
    hand_joint_path = save_path + 'handjoints.npy'
    object_left_hand_path = save_path + 'object_left.npy'
    object_bbx_left_hand_path = save_path + 'object_bbxleft.npy'
    object_right_hand_path = save_path + 'object_right.npy'
    object_bbx_right_hand_path = save_path + 'object_bbxright.npy'
    np.save(gaze_path, gaze_data)
    np.save(head_path, head_data)
    np.save(hand_path, hand_data)
    np.save(hand_joint_path, hand_joint_data)
    np.save(object_left_hand_path, object_left_hand_data)
    np.save(object_bbx_left_hand_path, object_bbx_left_hand_data)
    np.save(object_right_hand_path, object_right_hand_data)
    np.save(object_bbx_right_hand_path, object_bbx_right_hand_data)
local_time = time.asctime(time.localtime(time.time()))
print('\nProcessing ends at ' + local_time)