conan/processing/process_RTGene.py

import tensorflow as tf
import sys
import os
import argparse
import cv2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from tqdm.notebook import tqdm
from rt_gene.gaze_tools import get_phi_theta_from_euler, limit_yaw
from rt_gene.extract_landmarks_method_base import LandmarkMethodBase
from rt_gene.estimate_gaze_base import GazeEstimatorBase
from rt_gene.estimate_gaze_tensorflow import GazeEstimator
from rt_gene.gaze_tools_standalone import euler_from_matrix
import itertools
import pandas as pd
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

def getCenter(box):
    # Center of an [x_min, y_min, x_max, y_max] bounding box
    return np.array([box[2] + box[0], box[3] + box[1]]) / 2

def load_camera_calibration(calibration_file):
    fileType = calibration_file.split(".")[-1]
    if fileType == "pkl":
        import pickle
        with open(calibration_file, 'rb') as infile:
            data = pickle.load(infile)
        return data["distortion_coef"], data["camera_matrix"]
    elif fileType == "yaml":
        import yaml
        with open(calibration_file, 'r') as f:
            cal = yaml.safe_load(f)
        dist_coefficients = np.array(cal['distortion_coefficients']['data'], dtype='float32').reshape(1, 5)
        camera_matrix = np.array(cal['camera_matrix']['data'], dtype='float32').reshape(3, 3)
        return dist_coefficients, camera_matrix
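
# Calibration file layout expected by the loader above (inferred from the code,
# noted here for reference): a .pkl file is a pickled dict with keys
# "distortion_coef" and "camera_matrix"; a .yaml file follows the ROS
# camera_info convention with 'camera_matrix'/'data' (9 values) and
# 'distortion_coefficients'/'data' (5 values).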

def extract_eye_image_patches(subjects, landmark_estimator):
    for subject in subjects:
        le_c, re_c, _, _ = subject.get_eye_image_from_landmarks(subject, landmark_estimator.eye_image_size)
        subject.left_eye_color = le_c
        subject.right_eye_color = re_c

def estimate_gaze(base_name, color_img, landmark_estimator, gaze_estimator, dist_coefficients, camera_matrix, args):
    faceboxes = landmark_estimator.get_face_bb(color_img)
    if len(faceboxes) == 0:
        tqdm.write('Could not find faces in the image')
        return

    subjects = landmark_estimator.get_subjects_from_faceboxes(color_img, faceboxes)
    extract_eye_image_patches(subjects, landmark_estimator)

    input_r_list = []
    input_l_list = []
    input_head_list = []
    valid_subject_list = []
    roll_pitch_yaw_list = []

    for idx, subject in enumerate(subjects):
        if subject.left_eye_color is None or subject.right_eye_color is None:
            # tqdm.write('Failed to extract eye image patches')
            continue

        success, rotation_vector, _ = cv2.solvePnP(landmark_estimator.model_points,
                                                   subject.landmarks.reshape(len(subject.landmarks), 1, 2),
                                                   cameraMatrix=camera_matrix,
                                                   distCoeffs=dist_coefficients, flags=cv2.SOLVEPNP_DLS)
        if not success:
            tqdm.write('Not able to extract head pose for subject {}'.format(idx))
            continue

        _rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
        _rotation_matrix = np.matmul(_rotation_matrix, np.array([[0, 1, 0], [0, 0, -1], [-1, 0, 0]]))
        _m = np.zeros((4, 4))
        _m[:3, :3] = _rotation_matrix
        _m[3, 3] = 1

        # Go from camera space to ROS space
        _camera_to_ros = [[0.0, 0.0, 1.0, 0.0],
                          [-1.0, 0.0, 0.0, 0.0],
                          [0.0, -1.0, 0.0, 0.0],
                          [0.0, 0.0, 0.0, 1.0]]
        roll_pitch_yaw = list(euler_from_matrix(np.dot(_camera_to_ros, _m)))
        roll_pitch_yaw = limit_yaw(roll_pitch_yaw)
        roll_pitch_yaw_list.append(roll_pitch_yaw)

        phi_head, theta_head = get_phi_theta_from_euler(roll_pitch_yaw)

        face_image_resized = cv2.resize(subject.face_color, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
        head_pose_image = landmark_estimator.visualize_headpose_result(face_image_resized, (phi_head, theta_head))

        if args['vis_headpose']:
            plt.axis("off")
            plt.imshow(cv2.cvtColor(head_pose_image, cv2.COLOR_BGR2RGB))
            plt.show()

        if args['save_headpose']:
            cv2.imwrite(os.path.join(args['output_path'], os.path.splitext(base_name)[0] + '_headpose.jpg'), head_pose_image)

        input_r_list.append(gaze_estimator.input_from_image(subject.right_eye_color))
        input_l_list.append(gaze_estimator.input_from_image(subject.left_eye_color))
        input_head_list.append([theta_head, phi_head])
        valid_subject_list.append(idx)

    if len(valid_subject_list) == 0:
        return

    gaze_est = gaze_estimator.estimate_gaze_twoeyes(inference_input_left_list=input_l_list,
                                                    inference_input_right_list=input_r_list,
                                                    inference_headpose_list=input_head_list)

    file_base = os.path.splitext(base_name)[0]
    file = "_".join(file_base.split("_")[:-1])
    frame = int(file_base.split("_")[-1])
    ret = []
    for subject_id, gaze, headpose, roll_pitch_yaw in zip(valid_subject_list, gaze_est.tolist(), input_head_list, roll_pitch_yaw_list):
        subject = subjects[subject_id]
        # print(roll_pitch_yaw)

        # Build visualizations
        r_gaze_img = gaze_estimator.visualize_eye_result(subject.right_eye_color, gaze)
        l_gaze_img = gaze_estimator.visualize_eye_result(subject.left_eye_color, gaze)
        s_gaze_img = np.concatenate((r_gaze_img, l_gaze_img), axis=1)

        if args['vis_gaze']:
            plt.axis("off")
            plt.imshow(cv2.cvtColor(s_gaze_img, cv2.COLOR_BGR2RGB))
            plt.show()

        if args['save_gaze']:
            cv2.imwrite(os.path.join(args['output_path'], os.path.splitext(base_name)[0] + '_gaze.jpg'), s_gaze_img)
            # cv2.imwrite(os.path.join(args.output_path, os.path.splitext(base_name)[0] + '_left.jpg'), subject.left_eye_color)
            # cv2.imwrite(os.path.join(args.output_path, os.path.splitext(base_name)[0] + '_right.jpg'), subject.right_eye_color)

        if args['save_estimate']:
            with open(os.path.join(args['output_path'], os.path.splitext(base_name)[0] + '_output.txt'), 'w+') as f:
                f.write(os.path.splitext(base_name)[0] + ', [' + str(headpose[1]) + ', ' + str(headpose[0]) + ']' +
                        ', [' + str(gaze[1]) + ', ' + str(gaze[0]) + ']' + '\n')

        # Phi: pos - look down, neg - look up
        # Theta: pos - rotate left, neg - rotate right
        d = {"File": file, "Frame": frame, "SubjectId": subject_id, "HeadBox": subject.box, "Landmarks": subject.landmarks,
             "GazeTheta": gaze[0], "GazePhi": gaze[1], "HeadPoseTheta": headpose[0], "HeadPosePhi": headpose[1],
             "HeadPoseRoll": roll_pitch_yaw[0], "HeadPosePitch": roll_pitch_yaw[1], "HeadPoseYaw": roll_pitch_yaw[2]}
        ret.append(d)
    return ret

def visualize(df, FRAMES):
    path_list = [f for f in os.listdir(FRAMES) if '.jpg' in f]
    path_list.sort()
    image = cv2.imread(os.path.join(FRAMES, path_list[0]))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    fig, ax = plt.subplots(1, figsize=(18, 10))
    for i in range(len(df.SubjectId.unique())):
        bbox = df.loc[(df.Frame == 0) & (df.SubjectId == i)]['HeadBox'].values
        print(bbox)
        if not np.any(pd.isna(bbox)) and len(bbox) > 0:
            bbox = np.array(bbox[0])
            rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1],
                                     linewidth=1, edgecolor='c', facecolor='none')
            plt.text(bbox[0], bbox[1], 'ID%i' % i, color='c', fontsize=20)
            ax.add_patch(rect)
    ax.imshow(image)
    plt.show()

def visualize_sorting(df_sorted):
    subs = sorted(df_sorted[~df_sorted.PId.isna()].PId.unique())
    for sid in subs:
        x = df_sorted[df_sorted.PId == sid].HeadCenter.apply(lambda x: x[0])
        y = df_sorted[df_sorted.PId == sid].HeadCenter.apply(lambda x: x[1])
        frames = df_sorted[df_sorted.PId == sid].Frame.to_list()
        plt.scatter(frames, x, alpha=.2, label="Sub %i" % sid)
    plt.legend()
    plt.show()

def process(file, maxPeople, cameraRes=[5760, 2880]):
    VIDEO = file
    VIDEOOUT = VIDEO.split("/")[-1].split(".")[0]
    ROOT = "/".join(VIDEO.split("/")[:-1]) + "/"
    TMP_DIR = "/".join(VIDEO.split("/")[:-2]) + "/temp/"
    FRAMES = "%s%s_frames" % (TMP_DIR, VIDEOOUT)
    if not os.path.exists(VIDEO):
        print('WARNING: Could not find video file')
        return

    script_path = "./"
    args = {}
    args["calib_file"] = "./calib_insta.pkl"
    args["vis_headpose"] = False  # store_false
    args["save_headpose"] = False  # store_false
    args["vis_gaze"] = False  # store_false
    args["save_gaze"] = False  # store_false
    args["save_estimate"] = False  # store_false
    args["device_id_facedetection"] = "cuda:0"  # store_false
    args["im_path"] = os.path.join(script_path, './samples_gaze/')
    args["output_path"] = os.path.join(script_path, './samples_gaze/')
    args["models"] = [os.path.join(script_path, '../model_nets/Model_allsubjects1.h5')]
    args['gaze_backend'] = 'tensorflow'

    tqdm.write('Loading networks')
    landmark_estimator = LandmarkMethodBase(device_id_facedetection=args["device_id_facedetection"],
                                            checkpoint_path_face=os.path.join(script_path, "../model_nets/SFD/s3fd_facedetector.pth"),
                                            checkpoint_path_landmark=os.path.join(script_path, "../model_nets/phase1_wpdc_vdc.pth.tar"),
                                            model_points_file=os.path.join(script_path, "../model_nets/face_model_68.txt"))

    # gaze_estimator = GazeEstimator("/gpu:0", args['models'])
    if args['gaze_backend'] == "tensorflow":
        from rt_gene.estimate_gaze_tensorflow import GazeEstimator
        gaze_estimator = GazeEstimator("/gpu:0", args['models'])
    elif args['gaze_backend'] == "pytorch":
        from rt_gene.estimate_gaze_pytorch import GazeEstimator
        gaze_estimator = GazeEstimator("cuda:0", args['models'])
    else:
        raise ValueError("Incorrect gaze_backend, choices are: tensorflow or pytorch")

    if not os.path.isdir(args["output_path"]):
        os.makedirs(args["output_path"])

    video = cv2.VideoCapture(VIDEO)
    print('Video frame count: ', video.get(cv2.CAP_PROP_FRAME_COUNT))

    if args["calib_file"] is not None and os.path.exists(args["calib_file"]):
        _dist_coefficients, _camera_matrix = load_camera_calibration(args["calib_file"])
    else:
        im_width = video.get(cv2.CAP_PROP_FRAME_WIDTH)
        im_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
        print('WARNING!!! You should provide the camera calibration file, otherwise you might get bad results.\n'
              'Using a crude approximation!')
        _dist_coefficients, _camera_matrix = np.zeros((1, 5)), np.array(
            [[im_height, 0.0, im_width / 2.0], [0.0, im_height, im_height / 2.0], [0.0, 0.0, 1.0]])
    lstRet = []
    for i in tqdm(list(range(int(video.get(cv2.CAP_PROP_FRAME_COUNT))))):
        image_file_name = "%s_%i.XXX" % (os.path.splitext(VIDEO)[0], i)
        ret, image = video.read()
        if not ret:
            break
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        lstRet.append(estimate_gaze(image_file_name, image, landmark_estimator, gaze_estimator, _dist_coefficients, _camera_matrix, args))

    # estimate_gaze() returns None for frames without usable detections, so drop
    # those entries before flattening the per-frame lists into one list of records.
    lst = list(itertools.chain.from_iterable(r for r in lstRet if r))
    df = pd.DataFrame(lst)
    df["HeadCenter"] = df.HeadBox.apply(lambda x: getCenter(x))
    df["Phi"] = df.GazePhi + df.HeadPosePhi  # gaze yaw
    df["Theta"] = df.GazeTheta + df.HeadPoseTheta  # gaze pitch
    df['Yaw'] = df.GazePhi + df.HeadPoseYaw
    df['Pitch'] = df.GazeTheta + df.HeadPosePitch
    # path = '%s%s_raw.pkl' % (TMP_DIR, VIDEOOUT)
    # df.to_pickle(path)
    # print('Saved raw detections to: ', path)
    visualize(df, FRAMES)
    # Sort ID detections
    ###############################################################################################################
    # Find first frame where all are detected
    for frame in sorted(df.Frame.unique()):
        frame_df = df.loc[df.Frame == frame]
        if len(frame_df['SubjectId'].unique()) == maxPeople:
            first_frame = frame
            print('First frame where all are detected: ', first_frame)
            break

    # Copy the detections of that frame to frame 0 so that every person has an
    # anchor detection at the start of the ID-assignment pass.
    empty_rows = pd.DataFrame()
    empty_rows['Frame'] = np.zeros(maxPeople).astype(int)
    for col in df.columns:
        if not col == 'Frame':
            empty_rows[col] = df.loc[df.Frame == first_frame, [col]].values
    df = df.loc[df.Frame != 0]
    df = df.append(empty_rows).sort_values(by=['Frame'])
    df.head()

    df_sorted = df.copy()
    df_sorted["PId"] = None
    df_sorted.loc[df_sorted.Frame == 0, "PId"] = list(range(maxPeople))
    df_sorted = df_sorted.sort_values("Frame")
    df_sorted.index = list(range(len(df_sorted)))
    # Assign person IDs: match each detection in the current frame to the nearest
    # head center in the most recent frame where all IDs are already assigned.
    for frameId in tqdm(sorted(df_sorted.Frame.unique())[1:]):
        pidAssignement = []
        for frameIdBefore in range(frameId - 1, -1, -1):
            allFramesBefore = df_sorted[(df_sorted.Frame == frameIdBefore) & (~df_sorted.PId.isna())]
            if np.array_equal(sorted(allFramesBefore.PId.to_list()), np.arange(maxPeople)):
                dfFramesCurrent = df_sorted[df_sorted.Frame == frameId]
                for indexCurrentFrame, frameCurrent in dfFramesCurrent.iterrows():
                    lst = []
                    for indexBeforeFrame, frameBefore in allFramesBefore.iterrows():
                        if frameBefore.HeadCenter[0] > frameCurrent.HeadCenter[0]:
                            p1 = np.array(frameCurrent.HeadCenter)
                            p2 = np.array(frameBefore.HeadCenter)
                        else:
                            p1 = np.array(frameBefore.HeadCenter)
                            p2 = np.array(frameCurrent.HeadCenter)
                        # Distance both directly and wrapped around the panorama width
                        # (cameraRes[0] pixels), taking the smaller of the two
                        v1 = p1 - p2
                        dist1 = np.linalg.norm(v1)
                        p1[0] = p1[0] + cameraRes[0]
                        v2 = p1 - p2
                        dist2 = np.linalg.norm(v2)
                        dist = min([dist1, dist2])
                        lst.append([dist, frameCurrent.name, indexBeforeFrame, frameBefore])
                    lst.sort(key=lambda x: x[0])
                    pidAssignement.append([indexCurrentFrame, lst[0][-1].PId])
                break
        for index, pid in pidAssignement:
            df_sorted.loc[df_sorted.index == index, "PId"] = pid

    visualize_sorting(df_sorted)
    del df_sorted["SubjectId"]
    # Rearrange DataFrame: each ID has specific columns
    ###############################################################################################################
    df_sorted = df_sorted[~df_sorted.PId.isna()].drop_duplicates(subset=['Frame', 'PId'])
    FACE_COUNT = len(df_sorted[~df_sorted.PId.isna()].PId.unique())
    df2 = df_sorted.pivot(index='Frame', columns="PId",
                          values=["Landmarks", "GazeTheta", "GazePhi", "HeadCenter", "HeadPoseTheta", "HeadPosePhi",
                                  "HeadPoseYaw", "HeadPosePitch", "HeadPoseRoll", "Phi", "Theta"])
    lst = []
    for label in ["Landmarks", "GazeTheta", "GazePhi", "Head", "HeadPoseTheta", "HeadPosePhi", "HeadPoseYaw",
                  "HeadPosePitch", "HeadPoseRoll", "Phi", "Theta"]:
        for head_id in range(FACE_COUNT):
            lst.append("ID%i_%s" % (head_id, label))
    df2.columns = lst
    df2 = df2.reset_index()

    path = "%s%s_RTGene.pkl" % (TMP_DIR, VIDEOOUT)
    df2.to_pickle(path)
    print("Saved RT-Gene detections to %s" % path)