import os
import time
from datetime import datetime

import cv2
import imageio
import numpy as np
import torch
import wandb

from ddqn_agent import Agent
from atari_env import ALEEnvironment
from args import HIntArgumentParser
from metacontroller import MetaController


GPU_DEVICE = 7
VERBOSE = 2
DEBUG = False
TEST = False

STOP_TRAINING_THRESHOLD = 0.90
EPISODES = 1000000
MAX_EPISODE_STEPS = 1000
RANDOM_PLAY_STEPS = 20000


# Use subgoals from gaze analysis
GOAL_MODE = 'full_sequence'  # 'full_sequence' or 'unique'
ALL_SUBGOALS = np.loadtxt('subgoals.txt', dtype=int, delimiter=',')
SUBGOAL_ORDER = [8, 6, 1, 0, 2, 7, 2, 0, 1, 6, 8, 9]
GOALS = len(SUBGOAL_ORDER) if GOAL_MODE == 'full_sequence' else len(np.unique(SUBGOAL_ORDER))
SUBGOALs = SUBGOAL_ORDER if GOAL_MODE == 'full_sequence' else np.array(SUBGOAL_ORDER)[np.sort(np.unique(SUBGOAL_ORDER, return_index=True)[1])]  # unsorted unique -> keep subgoal appearance order
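# For reference, with the SUBGOAL_ORDER above the two modes evaluate to:
#   'full_sequence': GOALS = 12, one agent per step of the demonstrated sequence
#   'unique':        SUBGOALs = [8, 6, 1, 0, 2, 7, 9] (first-appearance order), GOALS = 7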
TRAINED_GOALS = [False] * GOALS


def main():

    # init random seed
    RANDOM_SEED = np.random.randint(100)
    print(f'Setting random seed to {RANDOM_SEED}')
    os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)

    x = datetime.now()

    TIME_STAMP = x.strftime("%d%b%y-%H%M%S")
    MODEL_NAME = 'hInt-RL-full_' + str(RANDOM_SEED) + '_' + TIME_STAMP

    os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_DEVICE)
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

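    # actionMap is assumed to hold ALE action ids for Montezuma's Revenge (FIRE doubles as jump);
    # actionExplain labels them, and actionVectors gives a rough 2D direction per action that is
    # only used by the optional direction-reward shaping commented out further below.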
    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left']
    actionVectors = [[0, 0], [0.0, -1.0], [0.0, -1.0], [1.0, 0.0], [-1.0, 0.0], [0.0, 1.0],
                     [0.7071067811865475, -0.7071067811865475], [-0.7071067811865475, -0.7071067811865475]]
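
    # inv_label_mapping maps a subgoal id to its agent/goal index. It is only consulted in
    # 'unique' mode; in 'full_sequence' mode the true goal is simply the position in SUBGOAL_ORDER.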
    inv_label_mapping = {}
    for i, l in enumerate(SUBGOALs):  # np.unique without sorting the values
        inv_label_mapping[l] = i
    print(inv_label_mapping)

    goalExplain = {8: 'start', 6: 'rope', 1: 'lower right ladder', 0: 'danger zone', 2: 'lower left ladder', 7: 'key', 9: 'left door'}

    subgoal_success_tracker = [[0] for _ in range(GOALS)]
    subgoal_success_steps = [[] for _ in range(GOALS)]
    subgoal_trailing_performance = [0.0 for _ in range(GOALS)]

    parser = HIntArgumentParser()
    parser.set_common_args()

    args = parser.parse_args(['--model_name', MODEL_NAME, '--verbose', str(VERBOSE), '--random_seed', str(RANDOM_SEED),
                              '--num_goals', str(GOALS), '--goal_mode', GOAL_MODE])

    if not DEBUG:
        wandb.init(project="hInt-RL", config=vars(args), name=MODEL_NAME)

    print(*[f'\n[PARAM] {k}: {v}' for (k, v) in vars(args).items()])
    print()

    # Setup environment
    env = ALEEnvironment(args, device=f"cuda:{GPU_DEVICE}")
    input_shape = env.getStackedState().shape
    print(f'Input shape {input_shape}\n')

    device = torch.device(f"cuda:{GPU_DEVICE}" if torch.cuda.is_available() else "cpu")

    # Setup metacontroller
    metacontroller = MetaController(device=device, args=args, input_shape=input_shape, n_goals=GOALS, hidden_nodes=args.hidden_nodes)

    # Setup agents and their DQN models
    agent_list = []
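
    # One DDQN agent is created per goal; the range starts past the goals already marked as
    # trained in TRAINED_GOALS (all False at startup), so normally every goal gets an agent.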
    for goal in range(sum(TRAINED_GOALS), GOALS):
        print(f'Setting up agent for goal {goalExplain.get(SUBGOALs[goal])}')

        agent = Agent(device=device, goal=goal, input_shape=input_shape, n_actions=len(actionMap), random_play_steps=RANDOM_PLAY_STEPS, args=args)
        agent_list.append(agent)

    total_rewards = []
    wrong_meta_pred = 0
    total_steps = 0
    create_video = False

    for episode in range(EPISODES):
        if args.verbose >= 1:
            print(f'\n[Episode {episode}] Starting new episode...')

        # Initialize environment and get state
        env.restart()
        state = env.getStackedState()

        episode_steps = 0
        subgoal_agent_loss = [0.0 for _ in range(GOALS)]
        meta_labels = []
        wrong_goal = False
        goal_idx = 0
        true_goal = goal_idx if GOAL_MODE == 'full_sequence' else inv_label_mapping.get(SUBGOAL_ORDER[goal_idx])

        img_array = []
        if episode % 10000 == 0:
            create_video = True

        expert_goal = np.zeros((1, GOALS))
        expert_goal[0, true_goal] = 1.0
        meta_labels.append((state, expert_goal))  # append, but do not collect yet
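        # meta_labels buffers (state, one-hot expert goal) pairs for the metacontroller. Only the
        # last pair of the episode is collected at the end of the episode (see below); pairs for
        # already-trained goals are collected immediately, because those goals will be reached.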

        if TRAINED_GOALS[true_goal]:
            goal = true_goal
            # Collect sample for metacontroller because goal will be reached
            metacontroller.collect(state, expert_goal)
        else:
            # Metacontroller predict goal
            goal = metacontroller.sample(metacontroller.predict(state))

            if goal != true_goal:
                wrong_goal = True
                if args.verbose >= 1:
                    print(f"[Episode {episode}] Metacontroller predicted {goal} instead of {true_goal} as goal \U0001F47E ")
                wrong_meta_pred += 1
                if wrong_meta_pred % 100 == 0:
                    print(f'[Episode {episode}] 100 wrong meta choices. Resetting metacontroller... ')
                    metacontroller.reset()
            else:
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Metacontroller predicted goal {goal} as new goal...')

        all_step_times = []
        step_time = time.time()

        while not env.isTerminal() and episode_steps < MAX_EPISODE_STEPS and not wrong_goal:

            # Unroll episode until agent reaches goal
            while not env.trueGoalReached(SUBGOAL_ORDER[goal_idx]) and not env.isTerminal() and episode_steps < MAX_EPISODE_STEPS and not wrong_goal:

                goal_position = torch.tensor(np.array([ALL_SUBGOALS[SUBGOAL_ORDER[goal_idx]]]), device=device)

                if create_video:
                    img = env.getScreenOrig()

                    cv2.putText(img=img, text='goal : ' + goalExplain.get(SUBGOAL_ORDER[goal_idx]), org=(5, 205),
                                fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.3, color=(255, 255, 255), thickness=1)

                    img_array.append(img)

                action = agent_list[goal].select_action(state.unsqueeze(0))

                external_reward = env.act(actionMap[action])
                if external_reward != 0:
                    external_reward = 1.0
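                # Note: the clipped game reward is only computed here; the transition stored below
                # uses the intrinsic goal reward, so external_reward does not feed into learning.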

                all_step_times.append(time.time() - step_time)
                step_time = time.time()

                episode_steps += 1

                # Calculate intrinsic reward [optionally add distance reward]
                reward = 0.0
                if env.trueGoalReached(SUBGOAL_ORDER[goal_idx]):
                    reward += 1.0
                if env.isTerminal() or episode_steps == MAX_EPISODE_STEPS:
                    reward -= 1.0

"""
|
|
# Simple direction reward
|
|
goal_vec = env.get_goal_direction(goal)
|
|
action_vec = actionVectors[action]
|
|
direction_reward = np.dot(action_vec, goal_vec) / 100
|
|
|
|
reward += direction_reward
|
|
"""
|
|
|
|
"""
|
|
# Distance Reward
|
|
# Query agent location every 20 steps (otherwise training is too slow)
|
|
if episode_steps % 20 == 0:
|
|
env.detect_agent()
|
|
|
|
reward += env.distanceReward(lastGoal=(true_goal-1), goal=goal)
|
|
"""
|
|
|
|
                total_rewards.append(reward)
                reward = torch.tensor([reward], device=device)

                next_state = env.getStackedState()

                # Store transition and update network parameters
                agent_list[goal].remember(state, action, reward, next_state, env.isTerminal(), goal_position)

                # Move to the next state
                state = next_state

                # Optimize the policy network
                agent_loss = agent_list[goal].optimize_model()
                if agent_loss is not None:
                    subgoal_agent_loss[goal] += agent_loss

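            # The inner loop ended for one of three reasons: the step limit was hit, the current
            # subgoal was reached, or the agent died. Handle each case below.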
            # Update goal
            if episode_steps >= MAX_EPISODE_STEPS:
                subgoal_success_tracker[goal].append(0)
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Reached maximum episode steps: {episode_steps}. Terminate episode.')
                break
            elif env.trueGoalReached(SUBGOAL_ORDER[goal_idx]):
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Goal reached! \U0001F389 after step #{episode_steps}. ')

                subgoal_success_tracker[goal].append(1)
                subgoal_success_steps[goal].append(episode_steps)

                # Predict new goal and continue if it's the true goal
                goal_idx += 1

                if goal_idx == len(SUBGOAL_ORDER):
                    # Finished all options --> start new episode
                    wrong_goal = True
                    print(f"[Episode {episode}] Reached all goals! \U0001F38A \U0001F38A \U0001F38A")
                    break

                true_goal = goal_idx if GOAL_MODE == 'full_sequence' else inv_label_mapping.get(SUBGOAL_ORDER[goal_idx])
                expert_goal = np.zeros((1, GOALS))
                expert_goal[0, true_goal] = 1.0
                meta_labels.append((state, expert_goal))  # append, but do not collect yet

                # Metacontroller predict goal
                goal = metacontroller.sample(metacontroller.predict(state))

                if goal != true_goal:
                    wrong_goal = True
                    if args.verbose >= 1:
                        print(f"[Episode {episode}] Metacontroller predicted {goal} instead of {true_goal} as goal \U0001F47E ")
                    wrong_meta_pred += 1
                    if wrong_meta_pred % 100 == 0:
                        if args.verbose >= 1:
                            print(f'[Episode {episode}] 100 wrong meta choices. Resetting metacontroller... ')
                        metacontroller.reset()
                    break
                else:
                    # Continue with new goal
                    if args.verbose >= 1:
                        print(f'[Episode {episode}] Metacontroller predicted goal {goal} as new goal...')

            else:
                subgoal_success_tracker[goal].append(0)
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Agent killed after {episode_steps} steps \U0001F47E Terminate episode.')
                break

        if create_video and img_array:
            out_path = f'runs/video_run{TIME_STAMP}-E{episode}.gif'
            imageio.mimsave(out_path, img_array)
            print('Saved gif to', out_path)
            create_video = False

"""END OF EPISODE"""
|
|
# At end of episode: aggregate data for metacontroller
|
|
item = meta_labels[-1]
|
|
metacontroller.collect(item[0], item[1])
|
|
|
|
# Log agent losses if agent was running
|
|
total_steps += episode_steps
|
|
avg_step_time = sum(all_step_times) / len(all_step_times) if episode_steps > 0 else 0
|
|
#print(f'[Episode {episode}] episode time: {sum(all_step_times) / 60:.2f}min average step time: {avg_step_time*100:.1f}ms')
|
|
|
|
if not DEBUG:
|
|
wandb.log({'mean_reward': np.array(total_rewards[-episode_steps:]).mean(), 'avg_step_time': avg_step_time*100, 'episode': episode})
|
|
|
|
        for g in range(GOALS):
            if len(subgoal_success_tracker[g]) < episode:
                # goal not evaluated yet because other goals not reached
                subgoal_success_tracker[g].append(0)
            # agent_loss = subgoal_agent_loss[g] / episode_steps if episode_steps > 0 else 0
            # log_steps = episode_steps if (subgoal_success_tracker[g] and subgoal_success_tracker[g][-1] == 1) else -episode_steps
            if not DEBUG:
                wandb.log({f'sub-goal {g}': subgoal_success_tracker[g][-1], 'episode': episode})  # f'agent{g}_loss': agent_loss, f'agent{g}_steps': log_steps,

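        # check_training_clock gates how often the metacontroller is trained; train() fits it to
        # the (state, expert goal) pairs gathered via collect() and returns a loss and accuracy.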
        # Train metacontroller
        if metacontroller.check_training_clock():
            if args.verbose >= 2:
                print('###################################')
                print('### Training Metacontroller ###')
                print('###################################')

            meta_loss, meta_acc = metacontroller.train()
            if not DEBUG:
                wandb.log({'meta_loss': meta_loss, 'meta_acc': meta_acc, 'episode': episode})

            if args.verbose >= 2:
                print(f'Metacontroller loss: {meta_loss:.2f}\n')
                print('###################################')

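        # Trailing performance = success rate of the current goal over its last 100 recorded
        # attempts; once it exceeds STOP_TRAINING_THRESHOLD the corresponding agent stops training.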
        if len(subgoal_success_tracker[goal]) > 100:
            subgoal_trailing_performance[goal] = sum([v for v in subgoal_success_tracker[goal][-100:] if v is not None]) / 100.0

            if args.verbose >= 0:
                print(f'[Episode {episode}] Subgoal trailing performance for goal {goal} is {subgoal_trailing_performance[goal]:.2f}')
                print(f'[Episode {episode}] Subgoal agent {goal} steps done {agent_list[goal].steps_done}')
                print(f'[Episode {episode}] Subgoal agent {goal} epsilon value is {agent_list[goal].get_epsilon():.2f}')

            if subgoal_trailing_performance[goal] > STOP_TRAINING_THRESHOLD:
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Training for goal #{goal} completed...')
                if not agent_list[goal].training_done:
                    agent_list[goal].finish_training(TIME_STAMP)
                    TRAINED_GOALS[goal] = True

                if goal_idx == len(SUBGOAL_ORDER):
                    # Last goal reached trailing performance --> stop training
                    print("\nTraining completed \U0001F38A \U0001F38A \U0001F38A \n")
                    torch.save(metacontroller.model.state_dict(), f"runs/run{TIME_STAMP}-metacontroller.pt")
                    break

    # After training is completed
    for goal in range(GOALS):
        if subgoal_success_steps[goal]:
            print(f'\nAgent{goal} performance: {np.array(subgoal_success_tracker[goal]).mean():.2%}')
            print(f'Reached goal {goal} {sum(subgoal_success_tracker[goal])}x with {np.array(subgoal_success_steps[goal]).mean():.0f} average steps')


if __name__ == "__main__":
    main()