import os
import time
from datetime import datetime

import cv2
import imageio
import numpy as np
import torch
import wandb

from ddqn_agent import Agent
from atari_env import ALEEnvironment
from args import HIntArgumentParser
from metacontroller import MetaController


GPU_DEVICE = 7
VERBOSE = 2
DEBUG = False
TEST = False

STOP_TRAINING_THRESHOLD = 0.90
EPISODES = 1000000
MAX_EPISODE_STEPS = 1000
RANDOM_PLAY_STEPS = 20000


# Use subgoals from gaze analysis
GOAL_MODE = 'full_sequence'  # 'full_sequence' or 'unique'
ALL_SUBGOALS = np.loadtxt('subgoals.txt', dtype=int, delimiter=',')
SUBGOAL_ORDER = [8, 6, 1, 0, 2, 7, 2, 0, 1, 6, 8, 9]
GOALS = len(SUBGOAL_ORDER) if GOAL_MODE == 'full_sequence' else len(np.unique(SUBGOAL_ORDER))
SUBGOALs = SUBGOAL_ORDER if GOAL_MODE == 'full_sequence' else np.array(SUBGOAL_ORDER)[np.sort(np.unique(SUBGOAL_ORDER, return_index=True)[1])]  # unsorted unique -> keep subgoal appearance order
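# For reference, with the SUBGOAL_ORDER above the two modes evaluate to:
#   'full_sequence': GOALS = 12, one agent per step of the demonstrated sequence
#   'unique':        SUBGOALs = [8, 6, 1, 0, 2, 7, 9] (first-appearance order), GOALS = 7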
TRAINED_GOALS = [False] * GOALS


def main():

    # init random seed
    RANDOM_SEED = np.random.randint(100)
    print(f'Setting random seed to {RANDOM_SEED}')
    os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)

    x = datetime.now()

    TIME_STAMP = x.strftime("%d%b%y-%H%M%S")
    MODEL_NAME = 'hInt-RL-full_' + str(RANDOM_SEED) + '_' + TIME_STAMP

    os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_DEVICE)
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

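    # actionMap is assumed to hold ALE action ids for Montezuma's Revenge (FIRE doubles as jump);
    # actionExplain labels them, and actionVectors gives a rough 2D direction per action that is
    # only used by the optional direction-reward shaping commented out further below.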
    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left']
    actionVectors = [[0, 0], [0.0, -1.0], [0.0, -1.0], [1.0, 0.0], [-1.0, 0.0], [0.0, 1.0],
                     [0.7071067811865475, -0.7071067811865475], [-0.7071067811865475, -0.7071067811865475]]
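
    # inv_label_mapping maps a subgoal id to its agent/goal index. It is only consulted in
    # 'unique' mode; in 'full_sequence' mode the true goal is simply the position in SUBGOAL_ORDER.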
    inv_label_mapping = {}
    for i, l in enumerate(SUBGOALs):  # np.unique without sorting the values
        inv_label_mapping[l] = i
    print(inv_label_mapping)

    goalExplain = {8: 'start', 6: 'rope', 1: 'lower right ladder', 0: 'danger zone', 2: 'lower left ladder', 7: 'key', 9: 'left door'}

    subgoal_success_tracker = [[0] for _ in range(GOALS)]
    subgoal_success_steps = [[] for _ in range(GOALS)]
    subgoal_trailing_performance = [0.0 for _ in range(GOALS)]

    parser = HIntArgumentParser()
    parser.set_common_args()

    args = parser.parse_args(['--model_name', MODEL_NAME, '--verbose', str(VERBOSE), '--random_seed', str(RANDOM_SEED),
                              '--num_goals', str(GOALS), '--goal_mode', GOAL_MODE])

    if not DEBUG:
        wandb.init(project="hInt-RL", config=vars(args), name=MODEL_NAME)

    print(*[f'\n[PARAM] {k}: {v}' for (k, v) in vars(args).items()])
    print()

    # Setup environment
    env = ALEEnvironment(args, device=f"cuda:{GPU_DEVICE}")
    input_shape = env.getStackedState().shape
    print(f'Input shape {input_shape}\n')

    device = torch.device(f"cuda:{GPU_DEVICE}" if torch.cuda.is_available() else "cpu")

    # Setup metacontroller
    metacontroller = MetaController(device=device, args=args, input_shape=input_shape, n_goals=GOALS, hidden_nodes=args.hidden_nodes)

    # Setup agents and their DQN models
    agent_list = []
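
    # One DDQN agent is created per goal; the range starts past the goals already marked as
    # trained in TRAINED_GOALS (all False at startup), so normally every goal gets an agent.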
    for goal in range(sum(TRAINED_GOALS), GOALS):
        print(f'Setting up agent for goal {goalExplain.get(SUBGOALs[goal])}')

        agent = Agent(device=device, goal=goal, input_shape=input_shape, n_actions=len(actionMap), random_play_steps=RANDOM_PLAY_STEPS, args=args)
        agent_list.append(agent)

    total_rewards = []
    wrong_meta_pred = 0
    total_steps = 0
    create_video = False

    for episode in range(EPISODES):
        if args.verbose >= 1:
            print(f'\n[Episode {episode}] Starting new episode...')

        # Initialize environment and get state
        env.restart()
        state = env.getStackedState()

        episode_steps = 0
        subgoal_agent_loss = [0.0 for _ in range(GOALS)]
        meta_labels = []
        wrong_goal = False
        goal_idx = 0
        true_goal = goal_idx if GOAL_MODE == 'full_sequence' else inv_label_mapping.get(SUBGOAL_ORDER[goal_idx])

        img_array = []
        if episode % 10000 == 0:
            create_video = True

        expert_goal = np.zeros((1, GOALS))
        expert_goal[0, true_goal] = 1.0
        meta_labels.append((state, expert_goal))  # append, but do not collect yet
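        # meta_labels buffers (state, one-hot expert goal) pairs for the metacontroller. Only the
        # last pair of the episode is collected at the end of the episode (see below); pairs for
        # already-trained goals are collected immediately, because those goals will be reached.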

        if TRAINED_GOALS[true_goal]:
            goal = true_goal
            # Collect sample for metacontroller because goal will be reached
            metacontroller.collect(state, expert_goal)
        else:
            # Metacontroller predict goal
            goal = metacontroller.sample(metacontroller.predict(state))

            if goal != true_goal:
                wrong_goal = True
                if args.verbose >= 1:
                    print(f"[Episode {episode}] Metacontroller predicted {goal} instead of {true_goal} as goal \U0001F47E ")
                wrong_meta_pred += 1
                if wrong_meta_pred % 100 == 0:
                    print(f'[Episode {episode}] 100 wrong meta choices. Resetting metacontroller... ')
                    metacontroller.reset()
            else:
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Metacontroller predicted goal {goal} as new goal...')

        all_step_times = []
        step_time = time.time()

        while not env.isTerminal() and episode_steps < MAX_EPISODE_STEPS and not wrong_goal:

            # Unroll episode until agent reaches goal
            while not env.trueGoalReached(SUBGOAL_ORDER[goal_idx]) and not env.isTerminal() and episode_steps < MAX_EPISODE_STEPS and not wrong_goal:

                goal_position = torch.tensor(np.array([ALL_SUBGOALS[SUBGOAL_ORDER[goal_idx]]]), device=device)

                if create_video:
                    img = env.getScreenOrig()

                    cv2.putText(img=img, text='goal : ' + goalExplain.get(SUBGOAL_ORDER[goal_idx]), org=(5, 205),
                                fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.3, color=(255, 255, 255), thickness=1)

                    img_array.append(img)

                action = agent_list[goal].select_action(state.unsqueeze(0))

                external_reward = env.act(actionMap[action])
                if external_reward != 0:
                    external_reward = 1.0
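                # Note: the clipped game reward is only computed here; the transition stored below
                # uses the intrinsic goal reward, so external_reward does not feed into learning.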

                all_step_times.append(time.time() - step_time)
                step_time = time.time()

                episode_steps += 1

                # Calculate intrinsic reward [optionally add distance reward]
                reward = 0.0
                if env.trueGoalReached(SUBGOAL_ORDER[goal_idx]):
                    reward += 1.0
                if env.isTerminal() or episode_steps == MAX_EPISODE_STEPS:
                    reward -= 1.0

"""
|
|
# Simple direction reward
|
|
goal_vec = env.get_goal_direction(goal)
|
|
action_vec = actionVectors[action]
|
|
direction_reward = np.dot(action_vec, goal_vec) / 100
|
|
|
|
reward += direction_reward
|
|
"""
|
|
|
|
"""
|
|
# Distance Reward
|
|
# Query agent location every 20 steps (otherwise training is too slow)
|
|
if episode_steps % 20 == 0:
|
|
env.detect_agent()
|
|
|
|
reward += env.distanceReward(lastGoal=(true_goal-1), goal=goal)
|
|
"""
|
|
|
|
                total_rewards.append(reward)
                reward = torch.tensor([reward], device=device)

                next_state = env.getStackedState()

                # Store transition and update network parameters
                agent_list[goal].remember(state, action, reward, next_state, env.isTerminal(), goal_position)

                # Move to the next state
                state = next_state

                # Optimize the policy network
                agent_loss = agent_list[goal].optimize_model()
                if agent_loss is not None:
                    subgoal_agent_loss[goal] += agent_loss

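            # The inner loop ended for one of three reasons: the step limit was hit, the current
            # subgoal was reached, or the agent died. Handle each case below.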
            # Update goal
            if episode_steps >= MAX_EPISODE_STEPS:
                subgoal_success_tracker[goal].append(0)
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Reached maximum episode steps: {episode_steps}. Terminate episode.')
                break
            elif env.trueGoalReached(SUBGOAL_ORDER[goal_idx]):
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Goal reached! \U0001F389 after step #{episode_steps}. ')

                subgoal_success_tracker[goal].append(1)
                subgoal_success_steps[goal].append(episode_steps)

                # Predict new goal and continue if it's the true goal
                goal_idx += 1

                if goal_idx == len(SUBGOAL_ORDER):
                    # Finished all options --> start new episode
                    wrong_goal = True
                    print(f"[Episode {episode}] Reached all goals! \U0001F38A \U0001F38A \U0001F38A")
                    break

                true_goal = goal_idx if GOAL_MODE == 'full_sequence' else inv_label_mapping.get(SUBGOAL_ORDER[goal_idx])
                expert_goal = np.zeros((1, GOALS))
                expert_goal[0, true_goal] = 1.0
                meta_labels.append((state, expert_goal))  # append, but do not collect yet

                # Metacontroller predict goal
                goal = metacontroller.sample(metacontroller.predict(state))

                if goal != true_goal:
                    wrong_goal = True
                    if args.verbose >= 1:
                        print(f"[Episode {episode}] Metacontroller predicted {goal} instead of {true_goal} as goal \U0001F47E ")
                    wrong_meta_pred += 1
                    if wrong_meta_pred % 100 == 0:
                        if args.verbose >= 1:
                            print(f'[Episode {episode}] 100 wrong meta choices. Resetting metacontroller... ')
                        metacontroller.reset()
                    break
                else:
                    # Continue with new goal
                    if args.verbose >= 1:
                        print(f'[Episode {episode}] Metacontroller predicted goal {goal} as new goal...')

            else:
                subgoal_success_tracker[goal].append(0)
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Agent killed after {episode_steps} steps \U0001F47E Terminate episode.')
                break

        if create_video and img_array:
            out_path = f'runs/video_run{TIME_STAMP}-E{episode}.gif'
            imageio.mimsave(out_path, img_array)
            print('Saved gif to', out_path)
            create_video = False

"""END OF EPISODE"""
|
|
# At end of episode: aggregate data for metacontroller
|
|
item = meta_labels[-1]
|
|
metacontroller.collect(item[0], item[1])
|
|
|
|
# Log agent losses if agent was running
|
|
total_steps += episode_steps
|
|
avg_step_time = sum(all_step_times) / len(all_step_times) if episode_steps > 0 else 0
|
|
#print(f'[Episode {episode}] episode time: {sum(all_step_times) / 60:.2f}min average step time: {avg_step_time*100:.1f}ms')
|
|
|
|
if not DEBUG:
|
|
wandb.log({'mean_reward': np.array(total_rewards[-episode_steps:]).mean(), 'avg_step_time': avg_step_time*100, 'episode': episode})
|
|
|
|
        for g in range(GOALS):
            if len(subgoal_success_tracker[g]) < episode:
                # goal not evaluated yet because other goals not reached
                subgoal_success_tracker[g].append(0)
            # agent_loss = subgoal_agent_loss[g] / episode_steps if episode_steps > 0 else 0
            # log_steps = episode_steps if (subgoal_success_tracker[g] and subgoal_success_tracker[g][-1] == 1) else -episode_steps
            if not DEBUG:
                wandb.log({f'sub-goal {g}': subgoal_success_tracker[g][-1], 'episode': episode})  # f'agent{g}_loss': agent_loss, f'agent{g}_steps': log_steps,

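        # check_training_clock gates how often the metacontroller is trained; train() fits it to
        # the (state, expert goal) pairs gathered via collect() and returns a loss and accuracy.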
        # Train metacontroller
        if metacontroller.check_training_clock():
            if args.verbose >= 2:
                print('###################################')
                print('### Training Metacontroller ###')
                print('###################################')

            meta_loss, meta_acc = metacontroller.train()
            if not DEBUG:
                wandb.log({'meta_loss': meta_loss, 'meta_acc': meta_acc, 'episode': episode})

            if args.verbose >= 2:
                print(f'Metacontroller loss: {meta_loss:.2f}\n')
                print('###################################')

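        # Trailing performance = success rate of the current goal over its last 100 recorded
        # attempts; once it exceeds STOP_TRAINING_THRESHOLD the corresponding agent stops training.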
        if len(subgoal_success_tracker[goal]) > 100:
            subgoal_trailing_performance[goal] = sum([v for v in subgoal_success_tracker[goal][-100:] if v is not None]) / 100.0

            if args.verbose >= 0:
                print(f'[Episode {episode}] Subgoal trailing performance for goal {goal} is {subgoal_trailing_performance[goal]:.2f}')
                print(f'[Episode {episode}] Subgoal agent {goal} steps done {agent_list[goal].steps_done}')
                print(f'[Episode {episode}] Subgoal agent {goal} epsilon value is {agent_list[goal].get_epsilon():.2f}')

            if subgoal_trailing_performance[goal] > STOP_TRAINING_THRESHOLD:
                if args.verbose >= 1:
                    print(f'[Episode {episode}] Training for goal #{goal} completed...')
                if not agent_list[goal].training_done:
                    agent_list[goal].finish_training(TIME_STAMP)
                    TRAINED_GOALS[goal] = True

                if goal_idx == len(SUBGOAL_ORDER):
                    # Last goal reached trailing performance --> stop training
                    print("\nTraining completed \U0001F38A \U0001F38A \U0001F38A \n")
                    torch.save(metacontroller.model.state_dict(), f"runs/run{TIME_STAMP}-metacontroller.pt")
                    break

    # After training is completed
    for goal in range(GOALS):
        if subgoal_success_steps[goal]:
            print(f'\nAgent{goal} performance: {np.array(subgoal_success_tracker[goal]).mean():.2%}')
            print(f'Reached goal {goal} {sum(subgoal_success_tracker[goal])}x with {np.array(subgoal_success_steps[goal]).mean():.0f} average steps')


if __name__ == "__main__":
    main()