Int-HRL/Preprocess_AtariHEAD.ipynb

5.6 KiB

In [ ]:
import os
import numpy as np 
import cv2 
import pandas as pd
import matplotlib.pyplot as plt 
import subprocess
import warnings

import dataset_utils as utils

Read all Montezuma's Revenge trials

In [ ]:
DATA_PATH = 'montezuma_revenge/'

# Load the trial metadata, keep only the rows whose GameName contains
# 'montezuma_revenge', and order them by trial_id with a fresh 0..n-1 index.
meta_csv = os.path.join(DATA_PATH, 'meta_data.csv')
all_games = pd.read_csv(meta_csv)
is_montezuma = all_games['GameName'].str.contains('montezuma_revenge')
df = (
    all_games.loc[is_montezuma]
    .sort_values(by=['trial_id'], ascending=True)
    .reset_index(drop=True)
)
df.head()

Get folder names of each trial

In [ ]:
# Collect every trial archive (file name containing 'tar.bz2') found anywhere below DATA_PATH.
file_lst = [
    os.path.join(root, name)
    for root, dirs, files in os.walk(DATA_PATH)
    for name in files
    if 'tar.bz2' in name
]

# The folder name is the archive's base name up to its first '.'.
# os.path.basename strips the directory part first, so dots or OS-specific
# separators in the directories can never end up in (or truncate) the name.
folder_lst = [os.path.basename(f).split('.')[0] for f in file_lst]
# Order by the integer in front of the first '_' so the list lines up with
# df, which is sorted by trial_id.
# NOTE(review): the assignment below is positional; it assumes that integer
# prefix is the trial_id — confirm against the dataset.
folder_lst.sort(key=lambda x: int(str(x).split('_')[0]), reverse=False)

if len(folder_lst) != len(df):
    raise ValueError(
        f'Found {len(folder_lst)} tar.bz2 archives under {DATA_PATH} '
        f'but meta data lists {len(df)} trials.'
    )
df['trial_folder'] = folder_lst

df.to_pickle(os.path.join(DATA_PATH, "all_trials_summary.pkl"))
df.head()

Unpack all folders

In [ ]:
# Extract each trial's archive into DATA_PATH.
# NOTE(review): every archive is unpacked directly into DATA_PATH, including
# any that live in a sub-directory — confirm this matches where the frames
# are expected later.
for i in df.trial_id:
    # Match on the start of the archive's base name. A plain substring test on
    # the full path would also match e.g. trial 4 against '284_...'.
    prefix = f'{i}_'
    file = [f for f in file_lst if os.path.basename(f).startswith(prefix)]
    print(i, *file)
    if not file:
        raise FileNotFoundError(f'No tar.bz2 archive found for trial {i} under {DATA_PATH}')
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed to tar verbatim.
    ret = subprocess.call(['tar', '-jxf', file[0], '--directory', DATA_PATH])
    if ret != 0:
        warnings.warn(f'tar exited with status {ret} while unpacking {file[0]}')

Generate DataFrame with all trials

In [ ]:
%%time
def write_unique_id(frame_id, episode_id, trial_id):
    """Build the identifier '<trial_id>_<first two '_' parts of frame_id>_E<episode>'.

    Args:
        frame_id: Frame identifier string, or a missing value (NaN/None).
        episode_id: Episode number, or a missing value; a missing episode is
            written as episode 0.
        trial_id: Trial identifier; converted with str().

    Returns:
        The identifier string, or None when frame_id is missing.
    """
    # Without a frame id there is nothing to identify.
    if pd.isna(frame_id):
        return None

    episode = 0 if pd.isna(episode_id) else int(episode_id)
    frame_prefix = '_'.join(frame_id.split('_')[:2])
    return '_'.join([str(trial_id), frame_prefix, 'E{:01d}'.format(episode)])


def _load_trial(row):
    """Read one trial's annotation file and return its annotated frames.

    Args:
        row: One row of `df`; its GameName, trial_folder and trial_id are read.

    Returns:
        The DataFrame produced by utils.txt_to_dataframe for '<path>.txt' with
        an 'ID' and an 'img_path' column added and rows without an ID removed,
        or None when GameName is not one of the two known values.
    """
    if row.GameName == 'montezuma_revenge':
        path = os.path.join(DATA_PATH, row.trial_folder)
    elif row.GameName == 'montezuma_revenge_highscore':
        path = os.path.join(DATA_PATH, 'highscore', row.trial_folder)
    else:
        return None
    print(f'Reading {path}')

    # Read Annotations
    trial_df = utils.txt_to_dataframe(path + '.txt')

    # Write unique ID (None for rows without a frame_id)
    trial_df['ID'] = [
        write_unique_id(frame_id, episode_id, row.trial_id)
        for frame_id, episode_id in zip(trial_df['frame_id'], trial_df['episode_id'])
    ]

    # Write image paths
    trial_df['img_path'] = [
        os.path.join(path, str(frame_id) + '.png') for frame_id in trial_df['frame_id']
    ]

    # Cut frames without annotations
    return trial_df[trial_df.ID.notnull()]


if df.empty:
    raise ValueError('No trials listed in df; nothing to load.')

# Every trial, including the first, goes through the same path logic.
# The frames are collected and concatenated once, not grown inside the loop.
trial_frames = []
for idx in range(len(df)):
    trial_df = _load_trial(df.iloc[idx])
    if trial_df is None:
        # An unrecognised GameName has no known location; skip it and do not
        # try to read a file from an empty path.
        warnings.warn(f"GameName of row {idx} not recognised! Skipping trial.")
        continue
    print(f'Episodes: {trial_df.ID.unique()}\n')
    trial_frames.append(trial_df)

# join='inner' keeps only the columns shared by all trials.
full_df = pd.concat(trial_frames, join='inner', ignore_index=True)

# Reorder columns so that ID comes first
cols = ['ID'] + [c for c in full_df.columns.tolist() if not c == 'ID']
full_df = full_df[cols]

outpath = os.path.join(DATA_PATH, "all_trials.pkl")
print(f'Saving dataframe to {outpath}\n')

full_df.to_pickle(outpath)
full_df.head()