From 28fd6bab95007a85745a2ab8318dead59a999bfa Mon Sep 17 00:00:00 2001
From: Mo8it
Date: Sat, 15 Jan 2022 18:55:01 +0100
Subject: [PATCH] Fix reward

---
 README.adoc           |  2 +-
 src/RL/LocalCOMEnv.jl | 19 ++++++++-----------
 src/RL/RL.jl          |  8 ++++----
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/README.adoc b/README.adoc
index a996374..e198e64 100644
--- a/README.adoc
+++ b/README.adoc
@@ -2,4 +2,4 @@
 
 image:https://img.shields.io/badge/code%20style-blue-4495d1.svg[Code Style: Blue, link=https://github.com/invenia/BlueStyle]
 
-**Re**inforcement learning of **co**llective behaviour.
\ No newline at end of file
+**Re**inforcement learning of **co**llective behavior.
diff --git a/src/RL/LocalCOMEnv.jl b/src/RL/LocalCOMEnv.jl
index 8c2fef3..66eb076 100644
--- a/src/RL/LocalCOMEnv.jl
+++ b/src/RL/LocalCOMEnv.jl
@@ -156,11 +156,7 @@ function state_update_hook(env_helper::LocalCOMEnvHelper, particles::Vector{Part
         distance_to_local_center_of_mass_sum / n_particles
     env_helper.add_shape_reward_term =
         mean_distance_to_local_center_of_mass /
-        env_helper.max_distance_to_local_center_of_mass < 0.32
-
-    if env_helper.add_shape_reward_term
-        #println(mean_distance_to_local_center_of_mass / env_helper.max_distance_to_local_center_of_mass) # TODO: Remove
-    end
+        env_helper.max_distance_to_local_center_of_mass < 0.3
 
     env_helper.center_of_mass = ReCo.center_of_mass(particles, env_helper.half_box_len)
 
@@ -180,7 +176,11 @@ end
 Returns the reward such that it is 0 for value=max_value and 1 for value=0.
 """
 function minimizing_reward(value::Float64, max_value::Float64)
-    return (max_value - value) / (max_value + value)
+    if value > max_value
+        error("value > max_value")
+    end
+
+    return ((max_value - value) / (max_value + value))^2
 end
 
 function update_reward!(env::LocalCOMEnv, env_helper::LocalCOMEnvHelper, particle::Particle)
@@ -207,12 +207,9 @@ function update_reward!(env::LocalCOMEnv, env_helper::LocalCOMEnvHelper, particl
             env_helper.half_box_len,
         )
 
-        reward += unnormalized_reward(
-            elliptical_distance,
-            env_helper.max_elliptical_distance, # TODO: Fix sq
+        reward += minimizing_reward(
+            elliptical_distance, env_helper.max_elliptical_distance
         )
-
-        # println(elliptical_distance / env_helper.max_elliptical_distance) # TODO: Remove
     end
 
     env.shared.reward = reward / normalization
diff --git a/src/RL/RL.jl b/src/RL/RL.jl
index 7f185b6..d0378e7 100644
--- a/src/RL/RL.jl
+++ b/src/RL/RL.jl
@@ -25,8 +25,8 @@ include("Hooks.jl")
 
 function gen_agent(n_states::Int64, n_actions::Int64, ϵ_stable::Float64)
     # TODO: Optimize warmup and decay
-    warmup_steps = 200_000
-    decay_steps = 1_000_000
+    warmup_steps = 500_000
+    decay_steps = 5_000_000
 
     policy = QBasedPolicy(;
         learner=MonteCarloLearner(;
@@ -135,8 +135,8 @@ function run_rl(;
         agent(POST_EPISODE_STAGE, env)
 
         # TODO: Replace with live plot
-        display(hook.rewards)
-        display(agent.policy.explorer.step)
+        @show hook.rewards
+        @show agent.policy.explorer.step
     end
 
     # Post experiment