{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d094257c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np \n",
    "import cv2 \n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt \n",
    "import subprocess\n",
    "import warnings\n",
    "\n",
    "import dataset_utils as utils"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbabd791",
   "metadata": {},
   "source": [
    "\n",
    "### Read all Montezuma's Revenge trials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e42d862",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder of the dataset; meta_data.csv is read from here.\n",
    "DATA_PATH = 'montezuma_revenge/'\n",
    "\n",
    "meta_df = pd.read_csv(os.path.join(DATA_PATH, 'meta_data.csv'))\n",
    "\n",
    "# Keep the rows whose GameName contains 'montezuma_revenge', ordered by trial_id.\n",
    "# Built as one non-mutating chain so that re-running the cell always starts from the\n",
    "# freshly read metadata; na=False drops rows with a missing GameName instead of\n",
    "# failing on a NaN mask, regex=False makes the match a literal substring test.\n",
    "df = (\n",
    "    meta_df\n",
    "    .loc[meta_df['GameName'].str.contains('montezuma_revenge', na=False, regex=False)]\n",
    "    .sort_values(by='trial_id', ascending=True)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e42160ea",
   "metadata": {},
   "source": [
    "#### Get folder names of each trial"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9175b8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "ARCHIVE_SUFFIX = '.tar.bz2'\n",
    "\n",
    "# Collect every trial archive below DATA_PATH (sub-folders included).\n",
    "# endswith() instead of a substring test so partially downloaded or renamed files are ignored.\n",
    "file_lst = [\n",
    "    os.path.join(root, name)\n",
    "    for root, dirs, files in os.walk(DATA_PATH)\n",
    "    for name in files\n",
    "    if name.endswith(ARCHIVE_SUFFIX)\n",
    "]\n",
    "\n",
    "# Folder name = archive file name without the suffix. os.path.basename keeps this correct\n",
    "# when a directory in the path contains a dot and on platforms with other separators.\n",
    "folder_lst = [os.path.basename(f)[:-len(ARCHIVE_SUFFIX)] for f in file_lst]\n",
    "folder_lst.sort(key=lambda x: int(x.split('_')[0]))\n",
    "\n",
    "# The integer before the first '_' of a folder name is the trial id. Assign folders by that id\n",
    "# rather than by list position, so a missing or extra archive cannot shift folders onto the wrong rows.\n",
    "folder_by_trial = {int(name.split('_')[0]): name for name in folder_lst}\n",
    "trial_ids = df['trial_id'].astype(int)\n",
    "missing_ids = sorted(set(trial_ids) - set(folder_by_trial))\n",
    "if missing_ids:\n",
    "    raise ValueError(f'No archive found for trial_id(s): {missing_ids}')\n",
    "\n",
    "df['trial_folder'] = trial_ids.map(folder_by_trial)\n",
    "\n",
    "df.to_pickle(os.path.join(DATA_PATH, \"all_trials_summary.pkl\"))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28f38343",
   "metadata": {},
   "source": [
    "#### Unpack all folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c83443d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the archive of every trial into DATA_PATH.\n",
    "# NOTE(review): archives found in sub-folders are also extracted into DATA_PATH, while the\n",
    "# loading cell looks for highscore trials under DATA_PATH/highscore - confirm the archive layout.\n",
    "for i in df.trial_id:\n",
    "    # Match on the file-name prefix only: a substring test like '9_' in the full path would\n",
    "    # also hit '19_...' or an id fragment in the middle of another archive name.\n",
    "    file = [f for f in file_lst if os.path.basename(f).startswith(f'{i}_')]\n",
    "    print(i, *file)\n",
    "    if not file:\n",
    "        raise FileNotFoundError(f'No archive found for trial {i}')\n",
    "\n",
    "    # Argument list instead of a shell string, so paths with spaces or shell\n",
    "    # metacharacters are passed to tar unchanged.\n",
    "    status = subprocess.call(['tar', '-jxf', file[0], '--directory', DATA_PATH])\n",
    "    if status != 0:\n",
    "        # Report failed extractions instead of ignoring tar's exit status.\n",
    "        warnings.warn(f'tar exited with status {status} while extracting {file[0]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b65c1efb",
   "metadata": {},
   "source": [
    "## Generate Dataframe with all Trials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24ec2a88",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "def write_unique_id(frame_id, episode_id, trial_id):\n",
    "    if not pd.isna(episode_id) and not pd.isna(frame_id):\n",
    "        unique_id = str(trial_id)+ '_' + '_'.join(frame_id.split('_')[:2]) + '_E{:01d}'.format(int(episode_id))\n",
    "    elif not pd.isna(frame_id):\n",
    "        unique_id = str(trial_id)+ '_' + '_'.join(frame_id.split('_')[:2]) + '_E0'\n",
    "    else: \n",
    "        unique_id = None\n",
    "    return unique_id\n",
    "\n",
    "\n",
    "def _trial_path(game_name, trial_folder):\n",
    "    \"\"\"Return the folder path of a trial, or None if game_name is not a known variant.\"\"\"\n",
    "    if game_name == 'montezuma_revenge':\n",
    "        return os.path.join(DATA_PATH, trial_folder)\n",
    "    if game_name == 'montezuma_revenge_highscore':\n",
    "        return os.path.join(DATA_PATH, 'highscore', trial_folder)\n",
    "    return None\n",
    "\n",
    "\n",
    "def _load_trial(path, trial_id):\n",
    "    \"\"\"Read '<path>.txt' via utils.txt_to_dataframe and return its frames that have an ID.\n",
    "\n",
    "    Adds the columns 'ID' (see write_unique_id) and 'img_path' ('<path>/<frame_id>.png'),\n",
    "    and puts 'ID' first. Relies on the columns 'frame_id' and 'episode_id' being present.\n",
    "    \"\"\"\n",
    "    trial_df = utils.txt_to_dataframe(path + '.txt')\n",
    "\n",
    "    # Write unique ID (plain comprehension instead of a row-wise apply)\n",
    "    trial_df['ID'] = [\n",
    "        write_unique_id(frame_id, episode_id, trial_id)\n",
    "        for frame_id, episode_id in zip(trial_df['frame_id'], trial_df['episode_id'])\n",
    "    ]\n",
    "\n",
    "    # Write image paths\n",
    "    trial_df['img_path'] = [os.path.join(path, str(frame_id) + '.png') for frame_id in trial_df['frame_id']]\n",
    "\n",
    "    # Reorder columns so that ID comes first\n",
    "    cols = ['ID'] + [c for c in trial_df.columns if c != 'ID']\n",
    "\n",
    "    # Cut frames without annotations\n",
    "    return trial_df.loc[trial_df['ID'].notnull(), cols]\n",
    "\n",
    "\n",
    "# Every trial - including the first one - goes through the same GameName-based path\n",
    "# selection, so a highscore trial at the top of df is read from the highscore folder too.\n",
    "trial_frames = []\n",
    "for row in df.itertuples():\n",
    "    path = _trial_path(row.GameName, row.trial_folder)\n",
    "    if path is None:\n",
    "        # An empty path would only fail later while reading '.txt'; skip the trial instead.\n",
    "        warnings.warn(f'GameName of row {row.Index} not recognised! Skipping this trial.')\n",
    "        continue\n",
    "    print(f'Reading {path}')\n",
    "\n",
    "    trial_df = _load_trial(path, row.trial_id)\n",
    "\n",
    "    print(f'Episodes: {trial_df.ID.unique()}\\n')\n",
    "    trial_frames.append(trial_df)\n",
    "\n",
    "# Concatenate once at the end (growing a frame inside the loop copies it on every step).\n",
    "# join='inner' keeps only the columns shared by all trials.\n",
    "full_df = pd.concat(trial_frames, join='inner', ignore_index=True)\n",
    "\n",
    "outpath = os.path.join(DATA_PATH, \"all_trials.pkl\")\n",
    "print(f'Saving dataframe to {outpath}\\n')\n",
    "\n",
    "full_df.to_pickle(outpath)\n",
    "full_df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}