import os
import time
from datetime import datetime

import cv2
import imageio
import numpy as np
import torch
import wandb

from ddqn_agent import Agent
from atari_env import ALEEnvironment
from args import HIntArgumentParser
from metacontroller import MetaController

GPU_DEVICE = 7
VERBOSE = 2
DEBUG = False
TEST = False

STOP_TRAINING_THRESHOLD = 0.90
EPISODES = 1000000
MAX_EPISODE_STEPS = 1000
RANDOM_PLAY_STEPS = 20000

# Use subgoals from gaze analysis
GOAL_MODE = 'full_sequence'  # 'full_sequence' or 'unique'
ALL_SUBGOALS = np.loadtxt('subgoals.txt', dtype=int, delimiter=',')  # one comma-separated location row per subgoal id
SUBGOAL_ORDER = [8, 6, 1, 0, 2, 7, 2, 0, 1, 6, 8, 9]
GOALS = len(SUBGOAL_ORDER) if GOAL_MODE == 'full_sequence' else len(np.unique(SUBGOAL_ORDER))
# Unsorted unique: keep subgoal ids in order of first appearance
SUBGOALs = SUBGOAL_ORDER if GOAL_MODE == 'full_sequence' else np.array(SUBGOAL_ORDER)[np.sort(np.unique(SUBGOAL_ORDER, return_index=True)[1])]
TRAINED_GOALS = [False] * GOALS


def main():
    # Init random seed
    RANDOM_SEED = np.random.randint(100)
    print(f'Setting random seed to {RANDOM_SEED}')
    os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)

    TIME_STAMP = datetime.now().strftime("%d%b%y-%H%M%S")
    MODEL_NAME = 'hInt-RL-full_' + str(RANDOM_SEED) + '_' + TIME_STAMP

    os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_DEVICE)
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left']
    actionVectors = [[0, 0], [0.0, -1.0], [0.0, -1.0], [1.0, 0.0], [-1.0, 0.0], [0.0, 1.0],
                     [0.7071067811865475, -0.7071067811865475], [-0.7071067811865475, -0.7071067811865475]]

    # Map subgoal ids to agent/goal indices, keeping order of first appearance
    inv_label_mapping = {}
    for i, l in enumerate(SUBGOALs):
        inv_label_mapping[l] = i
    print(inv_label_mapping)

    goalExplain = {8: 'start', 6: 'rope', 1: 'lower right ladder', 0: 'danger zone',
                   2: 'lower left ladder', 7: 'key', 9: 'left door'}

    subgoal_success_tracker = [[0] for _ in range(GOALS)]
    subgoal_success_steps = [[] for _ in range(GOALS)]
    subgoal_trailing_performance = [0.0 for _ in range(GOALS)]

    parser = HIntArgumentParser()
    parser.set_common_args()
    args = parser.parse_args(['--model_name', MODEL_NAME, '--verbose', str(VERBOSE),
                              '--random_seed', str(RANDOM_SEED), '--num_goals', str(GOALS),
                              '--goal_mode', GOAL_MODE])

    if not DEBUG:
        wandb.init(project="hInt-RL", config=vars(args), name=MODEL_NAME)
    print(*[f'\n[PARAM] {k}: {v}' for (k, v) in vars(args).items()])
    print()

    # Setup environment
    env = ALEEnvironment(args, device=f"cuda:{GPU_DEVICE}")
    input_shape = env.getStackedState().shape
    print(f'Input shape {input_shape}\n')

    device = torch.device(f"cuda:{GPU_DEVICE}" if torch.cuda.is_available() else "cpu")

    # Setup metacontroller
    metacontroller = MetaController(device=device, args=args, input_shape=input_shape,
                                    n_goals=GOALS, hidden_nodes=args.hidden_nodes)

    # Setup agents and their DQN models
    agent_list = []
    for goal in range(sum(TRAINED_GOALS), GOALS):
        print(f'Setting up agent for goal {goalExplain.get(SUBGOALs[goal])}')
        agent = Agent(device=device, goal=goal, input_shape=input_shape, n_actions=len(actionMap),
                      random_play_steps=RANDOM_PLAY_STEPS, args=args)
        agent_list.append(agent)

    total_rewards = []
    wrong_meta_pred = 0
    total_steps = 0
    create_video = False
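    # Per-episode flow (summary of the loop below):
    #   1. The metacontroller predicts the next subgoal from the stacked frame state;
    #      a wrong prediction ends the episode and is counted in wrong_meta_pred.
    #   2. The DDQN agent assigned to the predicted subgoal acts until the subgoal is
    #      reached, the agent dies, or MAX_EPISODE_STEPS is exceeded.
    #   3. (state, one-hot expert goal) pairs are appended to meta_labels and passed to
    #      metacontroller.collect(); the metacontroller is trained in a supervised
    #      fashion whenever check_training_clock() fires.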
    for episode in range(EPISODES):
        if args.verbose >= 1:
            print(f'\n[Episode {episode}] Starting new episode...')

        # Initialize environment and get state
        env.restart()
        state = env.getStackedState()
        episode_steps = 0
        subgoal_agent_loss = [0.0 for _ in range(GOALS)]
        meta_labels = []
        wrong_goal = False

        goal_idx = 0
        true_goal = goal_idx if GOAL_MODE == 'full_sequence' else inv_label_mapping.get(SUBGOAL_ORDER[goal_idx])

        img_array = []
        if episode % 10000 == 0:
            create_video = True

        expert_goal = np.zeros((1, GOALS))
        expert_goal[0, true_goal] = 1.0
        meta_labels.append((state, expert_goal))  # append, but do not collect yet

        if TRAINED_GOALS[true_goal]:
            goal = true_goal
            # Collect sample for metacontroller because goal will be reached
            metacontroller.collect(state, expert_goal)
        else:
            # Metacontroller predicts goal
            goal = metacontroller.sample(metacontroller.predict(state))
            if goal != true_goal:
                wrong_goal = True
                if args.verbose >= 1:
                    print(f"[Episode {episode}] Metacontroller predicted {goal} instead of {true_goal} as goal \U0001F47E ")
                wrong_meta_pred += 1
                if wrong_meta_pred % 100 == 0:
                    print(f'[Episode {episode}] 100 wrong meta choices. Resetting metacontroller... ')
                    metacontroller.reset()
            else:
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Metacontroller predicted goal {goal} as new goal...')

        all_step_times = []
        step_time = time.time()

        while not env.isTerminal() and episode_steps < MAX_EPISODE_STEPS and not wrong_goal:
            # Unroll episode until the agent reaches the current subgoal
            while (not env.trueGoalReached(SUBGOAL_ORDER[goal_idx]) and not env.isTerminal()
                   and episode_steps < MAX_EPISODE_STEPS and not wrong_goal):
                goal_position = torch.tensor(np.array([ALL_SUBGOALS[SUBGOAL_ORDER[goal_idx]]]), device=device)

                if create_video:
                    img = env.getScreenOrig()
                    cv2.putText(img=img, text='goal : ' + goalExplain.get(SUBGOAL_ORDER[goal_idx]),
                                org=(5, 205), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.3,
                                color=(255, 255, 255), thickness=1)
                    img_array.append(img)

                action = agent_list[goal].select_action(state.unsqueeze(0))
                external_reward = env.act(actionMap[action])
                if external_reward != 0:
                    external_reward = 1.0

                all_step_times.append(time.time() - step_time)
                step_time = time.time()
                episode_steps += 1

                # Calculate intrinsic reward [optionally add distance reward]
                reward = 0.0
                if env.trueGoalReached(SUBGOAL_ORDER[goal_idx]):
                    reward += 1.0
                if env.isTerminal() or episode_steps == MAX_EPISODE_STEPS:
                    reward -= 1.0
                """
                # Simple direction reward
                goal_vec = env.get_goal_direction(goal)
                action_vec = actionVectors[action]
                direction_reward = np.dot(action_vec, goal_vec) / 100
                reward += direction_reward
                """
                """
                # Distance reward
                # Query agent location every 20 steps (otherwise training is too slow)
                if episode_steps % 20 == 0:
                    env.detect_agent()
                reward += env.distanceReward(lastGoal=(true_goal - 1), goal=goal)
                """
                total_rewards.append(reward)
                reward = torch.tensor([reward], device=device)
                next_state = env.getStackedState()

                # Store transition and update network parameters
                agent_list[goal].remember(state, action, reward, next_state, env.isTerminal(), goal_position)

                # Move to the next state
                state = next_state

                # Optimize the policy network
                agent_loss = agent_list[goal].optimize_model()
                if agent_loss is not None:
                    subgoal_agent_loss[goal] += agent_loss
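            # Note: the external game reward returned by env.act() above is clipped but
            # never added to the learning signal; the sub-goal agents are trained purely
            # on the intrinsic reward (+1 for reaching the subgoal, -1 for dying or
            # hitting MAX_EPISODE_STEPS).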
            # Update goal
            if episode_steps >= MAX_EPISODE_STEPS:
                subgoal_success_tracker[goal].append(0)
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Reached maximum episode steps: {episode_steps}. Terminate episode.')
                break
            elif env.trueGoalReached(SUBGOAL_ORDER[goal_idx]):
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Goal reached! \U0001F389 after step #{episode_steps}.')
                subgoal_success_tracker[goal].append(1)
                subgoal_success_steps[goal].append(episode_steps)

                # Predict new goal and continue if it is the true goal
                goal_idx += 1
                if goal_idx == len(SUBGOAL_ORDER):
                    # Finished all options --> start new episode
                    wrong_goal = True
                    print(f"[Episode {episode}] Reached all goals! \U0001F38A \U0001F38A \U0001F38A")
                    break

                true_goal = goal_idx if GOAL_MODE == 'full_sequence' else inv_label_mapping.get(SUBGOAL_ORDER[goal_idx])
                expert_goal = np.zeros((1, GOALS))
                expert_goal[0, true_goal] = 1.0
                meta_labels.append((state, expert_goal))  # append, but do not collect yet

                # Metacontroller predicts goal
                goal = metacontroller.sample(metacontroller.predict(state))
                if goal != true_goal:
                    wrong_goal = True
                    if args.verbose >= 1:
                        print(f"[Episode {episode}] Metacontroller predicted {goal} instead of {true_goal} as goal \U0001F47E ")
                    wrong_meta_pred += 1
                    if wrong_meta_pred % 100 == 0:
                        if args.verbose >= 1:
                            print(f'[Episode {episode}] 100 wrong meta choices. Resetting metacontroller... ')
                        metacontroller.reset()
                    break
                else:
                    # Continue with new goal
                    if args.verbose >= 1:
                        print(f'[Episode {episode}] Metacontroller predicted goal {goal} as new goal...')
            else:
                subgoal_success_tracker[goal].append(0)
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Agent killed after {episode_steps} steps \U0001F47E Terminate episode.')
                break

        if create_video and img_array:
            out_path = f'runs/video_run{TIME_STAMP}-E{episode}.gif'
            imageio.mimsave(out_path, img_array)
            print('Saved gif to', out_path)
            create_video = False

        # End of episode: aggregate data for metacontroller
        item = meta_labels[-1]
        metacontroller.collect(item[0], item[1])

        # Log agent losses if agent was running
        total_steps += episode_steps
        avg_step_time = sum(all_step_times) / len(all_step_times) if episode_steps > 0 else 0
        # print(f'[Episode {episode}] episode time: {sum(all_step_times) / 60:.2f}min average step time: {avg_step_time*1000:.1f}ms')
        if not DEBUG:
            wandb.log({'mean_reward': np.array(total_rewards[-episode_steps:]).mean() if episode_steps > 0 else 0.0,
                       'avg_step_time': avg_step_time * 1000,  # ms
                       'episode': episode})

        for g in range(GOALS):
            if len(subgoal_success_tracker[g]) < episode:
                # Goal not evaluated yet because earlier goals were not reached
                subgoal_success_tracker[g].append(0)
            # agent_loss = subgoal_agent_loss[g] / episode_steps if episode_steps > 0 else 0
            # log_steps = episode_steps if (subgoal_success_tracker[g] and subgoal_success_tracker[g][-1] == 1) else -episode_steps
            if not DEBUG:
                wandb.log({f'sub-goal {g}': subgoal_success_tracker[g][-1], 'episode': episode})
                # f'agent{g}_loss': agent_loss, f'agent{g}_steps': log_steps,

        # Train metacontroller
        if metacontroller.check_training_clock():
            if args.verbose >= 2:
                print('###################################')
                print('### Training Metacontroller ###')
                print('###################################')
            meta_loss, meta_acc = metacontroller.train()
            if not DEBUG:
                wandb.log({'meta_loss': meta_loss, 'meta_acc': meta_acc, 'episode': episode})
            if args.verbose >= 2:
                print(f'Metacontroller loss: {meta_loss:.2f}\n')
                print('###################################')

        if len(subgoal_success_tracker[goal]) > 100:
            subgoal_trailing_performance[goal] = sum([v for v in subgoal_success_tracker[goal][-100:] if v is not None]) / 100.0
            if args.verbose >= 0:
                print(f'[Episode {episode}] Subgoal trailing performance for goal {goal} is {subgoal_trailing_performance[goal]:.2f}')
                print(f'[Episode {episode}] Subgoal agent {goal} steps done {agent_list[goal].steps_done}')
                print(f'[Episode {episode}] Subgoal agent {goal} epsilon value is {agent_list[goal].get_epsilon():.2f}')
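            # Early stopping: once the success rate over the last 100 episodes for this
            # subgoal exceeds STOP_TRAINING_THRESHOLD, its agent's training is frozen;
            # when the final subgoal in SUBGOAL_ORDER reaches that level, the
            # metacontroller weights are saved and the run ends.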
            if subgoal_trailing_performance[goal] > STOP_TRAINING_THRESHOLD:
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Training for goal #{goal} completed...')
                if not agent_list[goal].training_done:
                    agent_list[goal].finish_training(TIME_STAMP)
                TRAINED_GOALS[goal] = True
                if goal_idx == len(SUBGOAL_ORDER):
                    # Last goal reached trailing performance --> stop training
                    print("\nTraining completed \U0001F38A \U0001F38A \U0001F38A \n")
                    torch.save(metacontroller.model.state_dict(), f"runs/run{TIME_STAMP}-metacontroller.pt")
                    break

    # After training is completed
    for goal in range(GOALS):
        if subgoal_success_steps[goal]:
            print(f'\nAgent{goal} performance: {np.array(subgoal_success_tracker[goal]).mean():.2%}')
            print(f'Reached goal {goal} {sum(subgoal_success_tracker[goal])}x with {np.array(subgoal_success_steps[goal]).mean():.0f} average steps')


if __name__ == "__main__":
    main()
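# Reloading the saved metacontroller later (hedged sketch, not part of the training run;
# assumes the same MetaController constructor arguments used in main() and the checkpoint
# path written above):
#
#   metacontroller = MetaController(device=device, args=args, input_shape=input_shape,
#                                   n_goals=GOALS, hidden_nodes=args.hidden_nodes)
#   metacontroller.model.load_state_dict(torch.load(f"runs/run{TIME_STAMP}-metacontroller.pt"))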