Mirror of https://gitlab.rlp.net/mobitar/ReCo.jl.git, synced 2024-12-21 00:51:21 +00:00

Commit 02739b7de6 (parent b5767f0104): Added reward normalization
6 changed files with 36 additions and 9 deletions
Project.toml

@@ -1,7 +1,7 @@
 name = "ReCo"
 uuid = "b25f7548-fcc9-4c91-bc24-841b54f4dd54"
 authors = ["MoBit <mo8it@protonmail.com>"]
-version = "0.3.0"
+version = "0.4.0"

 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
@@ -22,6 +22,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Luxor = "ae8d54c2-7ccd-5906-9d76-62fc9837b5bc"
 MathTeXEngine = "0a4f8689-d25c-4efe-a92b-7142dfc1aa53"
+PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -6,6 +6,7 @@ struct EnvHelperSharedProps{H<:AbstractHook}
     hook::H

     n_steps_before_actions_update::Int64
+    n_actions_updates_per_episode::Int64

     elliptical_a_b_ratio::Float64

@@ -22,6 +23,7 @@ struct EnvHelperSharedProps{H<:AbstractHook}
         agent::Agent,
         hook::H,
         n_steps_before_actions_update::Int64,
+        n_actions_updates_per_episode::Int64,
         elliptical_a_b_ratio::Float64,
         n_particles::Int64,
     ) where {H<:AbstractHook}
@@ -30,6 +32,7 @@ struct EnvHelperSharedProps{H<:AbstractHook}
             agent,
             hook,
             n_steps_before_actions_update,
+            n_actions_updates_per_episode,
             elliptical_a_b_ratio,
             n_particles,
             fill(0, n_particles),
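The new n_actions_updates_per_episode field rides along with the other per-episode constants so that the reward helpers added at the end of this commit can read it. A stripped-down sketch of the pattern, using a hypothetical SharedPropsSketch rather than ReCo's actual EnvHelperSharedProps (which also carries the env, agent, hook, and more):

# Hypothetical, stripped-down stand-in for EnvHelperSharedProps: it only keeps
# the episode constants that the new reward normalization needs.
struct SharedPropsSketch
    n_steps_before_actions_update::Int64
    n_actions_updates_per_episode::Int64  # new in this commit
    n_particles::Int64
end

# Constructed once per run, here with made-up values:
shared = SharedPropsSketch(10, 1_000, 100)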
@@ -165,7 +165,6 @@ function update_reward!(
     env_helper::LocalCOMWithAdditionalShapeRewardEnvHelper,
     particle::ReCo.Particle,
 )
-    normalization = env_helper.shared.n_particles # TODO: Add factor from steps
     n_neighbours = env_helper.n_neighbours[particle.id]

     if n_neighbours == 0
@@ -191,7 +190,7 @@ function update_reward!(
             )
         end

-        env.shared.reward = reward / normalization
+        set_normalized_reward!(env, reward, env_helper)
     end

     return nothing
@@ -94,13 +94,10 @@ end
 function update_reward!(
     env::OriginEnv, env_helper::OriginEnvHelper, particle::ReCo.Particle
 )
-    normalization = env_helper.shared.n_particles # TODO: Add factor from steps
-
     reward = minimizing_reward(
         env_helper.distances_to_origin[particle.id], env_helper.max_distance_to_origin
     )
-
-    env.shared.reward = reward / normalization
+    set_normalized_reward!(env, reward, env_helper)

     return nothing
 end
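For orientation, minimizing_reward (defined in the last hunk of this commit) maps a distance in [0, max_value] to a reward in (0, 1] through a Gaussian of width max_value / 3. A quick check with hypothetical numbers:

# Standalone copy of minimizing_reward for illustration, mirroring the definition
# in the last hunk of this commit.
minimizing_reward(value::Float64, max_value::Float64) = exp(-0.5 * (value / (max_value / 3))^2)

minimizing_reward(0.0, 10.0)       # 1.0: zero distance earns the full reward
minimizing_reward(10.0 / 3, 10.0)  # exp(-0.5) ≈ 0.61: one width away
minimizing_reward(10.0, 10.0)      # exp(-4.5) ≈ 0.01: maximal distance earns almost nothing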
src/RL/RL.jl

@@ -70,6 +70,7 @@ function run_rl(;
     packing_ratio::Float64=0.15,
     show_progress::Bool=true,
     reward_discount::Float64=1.0,
+    show_simulation_progress::Bool=true,
 ) where {E<:Env}
     @assert 0.0 <= elliptical_a_b_ratio <= 1.0
     @assert n_episodes > 0
@@ -98,8 +99,15 @@ function run_rl(;

     hook = TotalRewardPerEpisode()

+    n_actions_updates_per_episode = ceil(Int64, episode_duration / update_actions_at)
     env_helper_shared = EnvHelperSharedProps(
-        env, agent, hook, n_steps_before_actions_update, elliptical_a_b_ratio, n_particles
+        env,
+        agent,
+        hook,
+        n_steps_before_actions_update,
+        n_actions_updates_per_episode,
+        elliptical_a_b_ratio,
+        n_particles,
     )

     env_helper_args = (
@@ -127,7 +135,13 @@ function run_rl(;
         agent(PRE_EPISODE_STAGE, env)

         # Episode
-        ReCo.run_sim(dir; duration=episode_duration, seed=episode, env_helper=env_helper)
+        ReCo.run_sim(
+            dir;
+            duration=episode_duration,
+            seed=episode,
+            env_helper=env_helper,
+            show_progress=show_simulation_progress,
+        )

         env.shared.terminated = true
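The added n_actions_updates_per_episode line derives the number of action updates from quantities run_rl already has; ceil rounds up, presumably so a trailing partial update interval still counts as one update. With hypothetical values (the actual defaults are not part of this diff):

# Hypothetical values, only to illustrate the arithmetic of the added line.
episode_duration = 100.0
update_actions_at = 0.1
n_actions_updates_per_episode = ceil(Int64, episode_duration / update_actions_at)  # 1000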
@@ -1,3 +1,16 @@
 function minimizing_reward(value::Float64, max_value::Float64)
     return exp(-0.5 * (value / (max_value / 3))^2)
 end
+
+function reward_normalization(env_helper::EnvHelper)
+    return env_helper.shared.n_particles * env_helper.shared.n_actions_updates_per_episode
+end
+
+function set_normalized_reward!(
+    env::Env, unnormalized_reward::Float64, env_helper::EnvHelper
+)
+    normalization = reward_normalization(env_helper)
+    env.shared.reward = unnormalized_reward / normalization
+
+    return nothing
+end
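Taken together, the normalization makes each particle's contribution count once per action update: if every particle receives one raw reward in [0, 1] at each of the n_actions_updates_per_episode updates, dividing by n_particles * n_actions_updates_per_episode keeps the accumulated per-episode reward bounded by 1, independent of particle count and episode length. A minimal, self-contained sketch with stand-in types (not ReCo's actual Env and EnvHelper):

# Stand-in types; ReCo's real Env and EnvHelper carry far more state.
mutable struct ToySharedProps
    n_particles::Int64
    n_actions_updates_per_episode::Int64
    reward::Float64
end

struct ToyEnvHelper
    shared::ToySharedProps
end

reward_normalization(helper::ToyEnvHelper) =
    helper.shared.n_particles * helper.shared.n_actions_updates_per_episode

function set_normalized_reward!(helper::ToyEnvHelper, unnormalized_reward::Float64)
    helper.shared.reward = unnormalized_reward / reward_normalization(helper)
    return nothing
end

# One episode: every particle gets a raw reward in [0, 1] at every action update.
let helper = ToyEnvHelper(ToySharedProps(10, 100, 0.0)), total = 0.0
    for _ in 1:helper.shared.n_actions_updates_per_episode, _ in 1:helper.shared.n_particles
        set_normalized_reward!(helper, rand())
        total += helper.shared.reward
    end
    @assert total <= 1.0  # the episode total never exceeds 1
end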