From 28fd6bab95007a85745a2ab8318dead59a999bfa Mon Sep 17 00:00:00 2001
From: Mo8it
Date: Sat, 15 Jan 2022 18:55:01 +0100
Subject: [PATCH] Fix reward

---
 README.adoc           |  2 +-
 src/RL/LocalCOMEnv.jl | 19 ++++++++-----------
 src/RL/RL.jl          |  8 ++++----
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/README.adoc b/README.adoc
index a996374..e198e64 100644
--- a/README.adoc
+++ b/README.adoc
@@ -2,4 +2,4 @@
 
 image:https://img.shields.io/badge/code%20style-blue-4495d1.svg[Code Style: Blue, link=https://github.com/invenia/BlueStyle]
 
-**Re**inforcement learning of **co**llective behaviour.
\ No newline at end of file
+**Re**inforcement learning of **co**llective behavior.
diff --git a/src/RL/LocalCOMEnv.jl b/src/RL/LocalCOMEnv.jl
index 8c2fef3..66eb076 100644
--- a/src/RL/LocalCOMEnv.jl
+++ b/src/RL/LocalCOMEnv.jl
@@ -156,11 +156,7 @@ function state_update_hook(env_helper::LocalCOMEnvHelper, particles::Vector{Part
         distance_to_local_center_of_mass_sum / n_particles
     env_helper.add_shape_reward_term =
         mean_distance_to_local_center_of_mass /
-        env_helper.max_distance_to_local_center_of_mass < 0.32
-
-    if env_helper.add_shape_reward_term
-        #println(mean_distance_to_local_center_of_mass / env_helper.max_distance_to_local_center_of_mass) # TODO: Remove
-    end
+        env_helper.max_distance_to_local_center_of_mass < 0.3
 
     env_helper.center_of_mass = ReCo.center_of_mass(particles, env_helper.half_box_len)
 
@@ -180,7 +176,11 @@ end
 Returns the reward such that it is 0 for value=max_value and 1 for value=0.
 """
 function minimizing_reward(value::Float64, max_value::Float64)
-    return (max_value - value) / (max_value + value)
+    if value > max_value
+        error("value > max_value")
+    end
+
+    return ((max_value - value) / (max_value + value))^2
 end
 
 function update_reward!(env::LocalCOMEnv, env_helper::LocalCOMEnvHelper, particle::Particle)
@@ -207,12 +207,9 @@ function update_reward!(env::LocalCOMEnv, env_helper::LocalCOMEnvHelper, particl
             env_helper.half_box_len,
         )
 
-        reward += unnormalized_reward(
-            elliptical_distance,
-            env_helper.max_elliptical_distance, # TODO: Fix sq
+        reward += minimizing_reward(
+            elliptical_distance, env_helper.max_elliptical_distance
         )
-
-        # println(elliptical_distance / env_helper.max_elliptical_distance) # TODO: Remove
     end
 
     env.shared.reward = reward / normalization
diff --git a/src/RL/RL.jl b/src/RL/RL.jl
index 7f185b6..d0378e7 100644
--- a/src/RL/RL.jl
+++ b/src/RL/RL.jl
@@ -25,8 +25,8 @@ include("Hooks.jl")
 
 function gen_agent(n_states::Int64, n_actions::Int64, ϵ_stable::Float64)
     # TODO: Optimize warmup and decay
-    warmup_steps = 200_000
-    decay_steps = 1_000_000
+    warmup_steps = 500_000
+    decay_steps = 5_000_000
 
     policy = QBasedPolicy(;
         learner=MonteCarloLearner(;
@@ -135,8 +135,8 @@ function run_rl(;
         agent(POST_EPISODE_STAGE, env)
 
         # TODO: Replace with live plot
-        display(hook.rewards)
-        display(agent.policy.explorer.step)
+        @show hook.rewards
+        @show agent.policy.explorer.step
     end
 
     # Post experiment