module RL

export run_rl

using ReinforcementLearning
using Flux: InvDecay
using Intervals
using StaticArrays: SVector
using LoopVectorization: @turbo
using Random: Random
using ProgressMeter: @showprogress

using ..ReCo: ReCo, Particle, angle2

const INITIAL_REWARD = 0.0

struct DistanceState{L<:Bound}
    interval::Interval{Float64,L,Closed}

    function DistanceState{L}(lower::Float64, upper::Float64) where {L<:Bound}
        return new(Interval{Float64,L,Closed}(lower, upper))
    end
end

struct DirectionState
    interval::Interval{Float64,Closed,Open}

    function DirectionState(lower::Float64, upper::Float64)
        return new(Interval{Float64,Closed,Open}(lower, upper))
    end
end

mutable struct EnvParams
    action_space::Vector{Tuple{Float64,Float64}}
    action_ind_space::Vector{Int64}
    distance_state_space::Vector{DistanceState}
    direction_state_space::Vector{DirectionState}
    state_space::Vector{Union{Tuple{DistanceState,DirectionState},Tuple{Nothing,Nothing}}}
    state_ind_space::Vector{Int64}
    n_states::Int64
    reward::Float64

    function EnvParams(
        min_distance::Float64,
        max_distance::Float64;
        n_v_actions::Int64=2,
        n_ω_actions::Int64=3,
        max_v::Float64=80.0,
        max_ω::Float64=π / 2,
        n_distance_states::Int64=2,
        n_direction_states::Int64=2,
    )
        @assert min_distance > 0.0
        @assert max_distance > min_distance
        @assert n_v_actions > 1
        @assert n_ω_actions > 1
        @assert max_v > 0
        @assert max_ω > 0

        v_action_space = 0.0:(max_v / (n_v_actions - 1)):max_v
        ω_action_space = (-max_ω):(2 * max_ω / (n_ω_actions - 1)):max_ω

        n_actions = n_v_actions * n_ω_actions

        action_space = Vector{Tuple{Float64,Float64}}(undef, n_actions)

        ind = 1
        for v in v_action_space
            for ω in ω_action_space
                action_space[ind] = (v, ω)
                ind += 1
            end
        end

        action_ind_space = collect(1:n_actions)

        distance_range =
            min_distance:((max_distance - min_distance) / n_distance_states):max_distance

        distance_state_space = Vector{DistanceState}(undef, n_distance_states)

        @simd for i in 1:n_distance_states
            if i == 1
                bound = Closed
            else
                bound = Open
            end

            distance_state_space[i] = DistanceState{bound}(
                distance_range[i], distance_range[i + 1]
            )
        end

        direction_range = (-π):(2 * π / n_direction_states):π

        direction_state_space = Vector{DirectionState}(undef, n_direction_states)

        @simd for i in 1:n_direction_states
            direction_state_space[i] = DirectionState(
                direction_range[i], direction_range[i + 1]
            )
        end

        n_states = n_distance_states * n_direction_states + 1

        state_space = Vector{
            Union{Tuple{DistanceState,DirectionState},Tuple{Nothing,Nothing}}
        }(
            undef, n_states
        )

        ind = 1
        for distance_state in distance_state_space
            for direction_state in direction_state_space
                state_space[ind] = (distance_state, direction_state)
                ind += 1
            end
        end
        state_space[ind] = (nothing, nothing)

        state_ind_space = collect(1:n_states)

        return new(
            action_space,
            action_ind_space,
            distance_state_space,
            direction_state_space,
            state_space,
            state_ind_space,
            n_states,
            INITIAL_REWARD,
        )
    end
end

function reset!(env_params::EnvParams)
    env_params.reward = INITIAL_REWARD

    return nothing
end

mutable struct Env <: AbstractEnv
    params::EnvParams
    particle::Particle
    state_ind::Int64

    function Env(params::EnvParams, particle::Particle)
        # initial_state = (nothing, nothing)
        initial_state_ind = params.n_states

        return new(params, particle, initial_state_ind)
    end
end

function reset!(env::Env, particle::Particle)
    env.particle = particle
    env.state_ind = env.params.n_states

    return nothing
end
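#=
The one-liners below implement the minimal RLBase interface
(state_space, state, action_space, reward, is_terminated) that
ReinforcementLearning.jl expects from an AbstractEnv.

As a rough orientation (derived from the defaults of EnvParams above, not an
additional definition): with n_distance_states = 2 and n_direction_states = 2
the tabular state space has 2 * 2 + 1 = 5 entries, the last index standing for
the (nothing, nothing) state of a particle whose nearest neighbour does not
fall into any distance interval; with n_v_actions = 2 and n_ω_actions = 3 the
action space holds 2 * 3 = 6 (v, ω) pairs.
=#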
RLBase.state_space(env::Env) = env.params.state_ind_space

RLBase.state(env::Env) = env.state_ind

RLBase.action_space(env::Env) = env.params.action_ind_space

RLBase.reward(env::Env) = env.params.reward

RLBase.is_terminated(::Env) = false

function gen_policy(n_states::Int64, n_actions::Int64)
    return QBasedPolicy(;
        learner=MonteCarloLearner(;
            approximator=TabularQApproximator(;
                n_state=n_states, n_action=n_actions, opt=InvDecay(1.0)
            ),
        ),
        explorer=EpsilonGreedyExplorer(0.1),
    )
end

struct Params{H<:AbstractHook}
    envs::Vector{Env}
    agents::Vector{Agent}
    hooks::Vector{H}
    actions::Vector{Tuple{Float64,Float64}}
    env_params::EnvParams
    n_steps_before_actions_update::Int64
    min_sq_distances::Vector{Float64}
    vecs_r⃗₁₂_to_min_distance_particle::Vector{SVector{2,Float64}}
    goal_shape_ratio::Float64

    function Params{H}(
        n_particles::Int64,
        env_params::EnvParams,
        n_steps_before_actions_update::Int64,
        goal_shape_ratio::Float64,
    ) where {H<:AbstractHook}
        envs = [Env(env_params, ReCo.gen_tmp_particle()) for i in 1:n_particles]

        agents = [
            Agent(;
                policy=gen_policy(env_params.n_states, length(env_params.action_space)),
                trajectory=VectorSARTTrajectory(),
            ) for i in 1:n_particles
        ]

        hooks = [H() for i in 1:n_particles]

        actions = Vector{Tuple{Float64,Float64}}(undef, n_particles)

        min_sq_distances = fill(Inf64, n_particles)

        vecs_r⃗₁₂_to_min_distance_particle = fill(SVector(0.0, 0.0), n_particles)

        return new(
            envs,
            agents,
            hooks,
            actions,
            env_params,
            n_steps_before_actions_update,
            min_sq_distances,
            vecs_r⃗₁₂_to_min_distance_particle,
            goal_shape_ratio,
        )
    end
end

function get_env_agent_hook(rl_params::Params, ind::Int64)
    return (rl_params.envs[ind], rl_params.agents[ind], rl_params.hooks[ind])
end

function pre_integration_hook!(rl_params::Params, n_particles::Int64)
    @simd for i in 1:n_particles
        env, agent, hook = get_env_agent_hook(rl_params, i)

        # Update action
        action_ind = agent(env)
        action = rl_params.env_params.action_space[action_ind]
        rl_params.actions[i] = action

        # Pre act
        agent(PRE_ACT_STAGE, env, action_ind)
        hook(PRE_ACT_STAGE, agent, env, action_ind)
    end

    @turbo for i in 1:n_particles
        rl_params.min_sq_distances[i] = Inf64
    end

    return nothing
end

function state_hook(
    id1::Int64, id2::Int64, r⃗₁₂::SVector{2,Float64}, distance²::Float64, rl_params::Params
)
    if rl_params.min_sq_distances[id1] > distance²
        rl_params.min_sq_distances[id1] = distance²
        rl_params.vecs_r⃗₁₂_to_min_distance_particle[id1] = r⃗₁₂
    end

    if rl_params.min_sq_distances[id2] > distance²
        rl_params.min_sq_distances[id2] = distance²
        rl_params.vecs_r⃗₁₂_to_min_distance_particle[id2] = -r⃗₁₂
    end

    return nothing
end

function integration_hook!(
    particle::Particle, rl_params::Params, δt::Float64, si::Float64, co::Float64
)
    # Apply action
    action = rl_params.actions[particle.id]

    vδt = action[1] * δt
    particle.tmp_c += SVector(vδt * co, vδt * si)
    particle.φ += action[2] * δt

    return nothing
end

function get_state_ind(state::Tuple{DistanceState,DirectionState}, env_params::EnvParams)
    return findfirst(x -> x == state, env_params.state_space)
end

function get_state_ind(::Tuple{Nothing,Nothing}, env_params::EnvParams)
    return env_params.n_states
end
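#=
Reward used in post_integration_hook below: every particle receives the same
global scalar

    reward = 1 - (gyration_tensor_eigvals_ratio - goal_shape_ratio)^2

which is maximal (1.0) when the measured gyration-tensor eigenvalue ratio of
the whole system matches goal_shape_ratio and falls off quadratically with the
deviation. For example, with goal_shape_ratio = 0.6 and a measured ratio of
0.8, the reward is 1 - 0.2^2 = 0.96.
=#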
function post_integration_hook(
    rl_params::Params,
    n_particles::Int64,
    particles::Vector{Particle},
    half_box_len::Float64,
)
    # Update reward
    rl_params.env_params.reward =
        1 -
        (
            ReCo.gyration_tensor_eigvals_ratio(particles, half_box_len) -
            rl_params.goal_shape_ratio
        )^2

    # Update states
    n_states = rl_params.env_params.n_states

    env_direction_state = rl_params.env_params.direction_state_space[1]

    for i in 1:n_particles
        env, agent, hook = get_env_agent_hook(rl_params, i)

        env_distance_state::Union{DistanceState,Nothing} = nothing

        min_sq_distance = rl_params.min_sq_distances[i]
        min_distance = sqrt(min_sq_distance)

        if !isinf(min_sq_distance)
            for distance_state in rl_params.env_params.distance_state_space
                if min_distance in distance_state.interval
                    env_distance_state = distance_state
                    break
                end
            end
        end

        if isnothing(env_distance_state)
            # (nothing, nothing)
            env.state_ind = n_states
        else
            r⃗₁₂ = rl_params.vecs_r⃗₁₂_to_min_distance_particle[i]

            si, co = sincos(particles[i].φ)

            #=
            Angle between two vectors:

                e = (co, si)
                angle = acos(dot(r⃗₁₂, e) / (norm(r⃗₁₂) * norm(e)))
                norm(r⃗₁₂) == min_distance
                norm(e) == 1

            min_distance is not infinite, because otherwise env_distance_state
            would be nothing and this else branch would not be reached.
            =#
            direction = angle2(SVector(co, si), r⃗₁₂)

            for direction_state in rl_params.env_params.direction_state_space
                if direction in direction_state.interval
                    env_direction_state = direction_state
                end
            end

            state = (env_distance_state, env_direction_state)
            env.state_ind = get_state_ind(state, env.params)
        end

        # Post act
        agent(POST_ACT_STAGE, env)
        hook(POST_ACT_STAGE, agent, env)
    end

    return nothing
end

function run_rl(;
    goal_shape_ratio::Float64,
    n_episodes::Int64=100,
    episode_duration::Float64=50.0,
    update_actions_at::Float64=0.2,
    n_particles::Int64=100,
    seed::Int64=42,
)
    @assert 0.0 <= goal_shape_ratio <= 1.0
    @assert n_episodes > 0
    @assert episode_duration > 0
    @assert update_actions_at in 0.01:0.01:episode_duration
    @assert n_particles > 0

    # Setup
    Random.seed!(seed)

    sim_consts = ReCo.gen_sim_consts(n_particles, 0.0; skin_to_interaction_r_ratio=3.0)
    n_particles = sim_consts.n_particles

    env_params = EnvParams(sim_consts.particle_radius, sim_consts.skin_r)

    n_steps_before_actions_update = round(Int64, update_actions_at / sim_consts.δt)

    rl_params = Params{TotalRewardPerEpisode}(
        n_particles, env_params, n_steps_before_actions_update, goal_shape_ratio
    )

    # Pre experiment
    @simd for i in 1:n_particles
        env, agent, hook = get_env_agent_hook(rl_params, i)

        hook(PRE_EXPERIMENT_STAGE, agent, env)
        agent(PRE_EXPERIMENT_STAGE, env)
    end

    @showprogress 0.6 for episode in 1:n_episodes
        dir, particles = ReCo.init_sim_with_sim_consts(sim_consts; parent_dir="RL")

        # Reset
        @simd for i in 1:n_particles
            reset!(rl_params.envs[i], particles[i])
        end

        reset!(rl_params.env_params)

        # Pre episode
        @simd for i in 1:n_particles
            env, agent, hook = get_env_agent_hook(rl_params, i)

            hook(PRE_EPISODE_STAGE, agent, env)
            agent(PRE_EPISODE_STAGE, env)
        end

        # Episode
        ReCo.run_sim(
            dir;
            duration=episode_duration,
            seed=rand(1:typemax(Int64)),
            rl_params=rl_params,
        )

        # Post episode
        @simd for i in 1:n_particles
            env, agent, hook = get_env_agent_hook(rl_params, i)

            hook(POST_EPISODE_STAGE, agent, env)
            agent(POST_EPISODE_STAGE, env)
        end
    end

    # Post experiment
    @simd for i in 1:n_particles
        env, agent, hook = get_env_agent_hook(rl_params, i)

        hook(POST_EXPERIMENT_STAGE, agent, env)
    end

    return rl_params
end

end # module
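# Example usage (a sketch, not part of the module): assuming this file is loaded
# as the RL submodule of the parent ReCo package, as the relative import
# `using ..ReCo` suggests, a short training run could look like
#
#     using ReCo
#
#     rl_params = ReCo.RL.run_rl(;
#         goal_shape_ratio=0.5,
#         n_episodes=10,
#         episode_duration=20.0,
#         n_particles=50,
#     )
#
# Afterwards, rl_params.hooks[i] holds the TotalRewardPerEpisode record of
# particle i and rl_params.agents[i] the trained agent with its tabular
# Q-approximator.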