
Commit 7dd32717 authored by Alexis Brenon

📝 Add agent documentation

parent f0c2af01
--- Base class/interface from which inherit to implement an agent.
--- Abstract class from which inherit to implement an agent.
-- @inherit true
-- @see ArcadesComponent
-- @classmod agent.BaseAgent
-- @alias class
-- @author Alexis BRENON <alexis.brenon@imag.fr>
......@@ -11,12 +13,13 @@ local module = {}
local class, super = torch.class('BaseAgent', 'ArcadesComponent', module)
--- Abstract constructor.
-- @tparam table args
function class:__init(args)
super.__init(self, args)
end
--- Interface
-- @section interface
--- Abstract Methods
-- @section abstract-methods
-- luacheck: push no unused args
......@@ -24,7 +27,7 @@ end
-- @tparam table state The current state of the environment
-- @tparam torch.Tensor state.observation The actual observations
-- @tparam boolean state.terminal Is this state terminal?
-- @return self
-- @return `self`
-- @see environment.BaseEnvironment.observable_state
function class:integrate_observation(state)
io.stderr:write(string.format(
......@@ -50,7 +53,7 @@ end
--- Reward or punish the agent.
-- @tparam number reward Reward if positive, punishment if negative
-- @return self
-- @return `self`
function class:give_reward(reward)
error(string.format(
"%s:%s - Not implemented",
......@@ -83,10 +86,13 @@ function class:get_learned_epoch()
2)
end
--- Public Methods
-- @section public-methods
--- Put the agent in a training mode.
--
-- This is the default mode of an agent.
-- @return self
-- @return `self`
function class:training()
return self
end
......@@ -94,8 +100,8 @@ end
--- Put the agent in an evaluation mode.
--
-- This can change some internal values of the agent, like the epsilon value
-- for epsilon-greedy strategy
-- @return self
-- for epsilon-greedy strategy.
-- @return `self`
function class:evaluate()
return self
end
......@@ -108,5 +114,4 @@ function class:report()
return ""
end
return module.BaseAgent
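As orientation for readers new to this interface, the following is a minimal, hypothetical sketch of how a concrete agent could subclass `BaseAgent` with the same `torch.class` pattern used above; the class name `ConstantAgent`, its behaviour, and the require path are illustrative assumptions, not part of the commit.

-- Hypothetical subclass of BaseAgent (illustration only).
local torch = require('torch')
local agent = require('arcades.agent')  -- assumed module path; registers 'BaseAgent'
local module = {}
local class, super = torch.class('ConstantAgent', 'BaseAgent', module)

function class:__init(args)
    super.__init(self, args)
    self.action = (args and args.action) or 1  -- always return the same action
    self.steps = 0
end

-- Implement the abstract methods declared by BaseAgent.
function class:integrate_observation(state)
    self.steps = self.steps + 1
    return self
end

function class:get_action()
    return self.action
end

function class:give_reward(reward)
    return self  -- this agent never learns
end

function class:get_experienced_interactions()
    return self.steps
end

return module.ConstantAgent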
--- DQN based agent.
-- This agent mix DNN and Q-Learning as stated in the Nature letter :
-- "Human-level control through deep reinforcement learning" (Mnih et al.)
-- This agent mixes a DNN with Q-Learning as stated in the Nature letter:<br/>
-- ["Human-level control through deep reinforcement learning" (Mnih et al.)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
-- @inherit true
-- @see agent.BaseAgent
-- @see ArcadesComponent
-- @classmod agent.NeuralQLearner
-- @alias class
-- @author Alexis BRENON <alexis.brenon@imag.fr>
......@@ -15,46 +18,76 @@ assert(agent.BaseAgent)
local module = {}
local class, super = torch.class('NeuralQLearner', 'BaseAgent', module)
--- Data Types
-- @section data-types
--- Table used as arguments for the DQN @{__init|constructor}.
-- @tfield {int,...} observation_size Size of the observations from the environment `{d, w, h}`
-- @tfield table actions Available actions from the environment (`actions[0]` is `noop`)
-- @tfield[opt] ClassArgument preprocess Parameters of the network used to preprocess states
-- @tfield ClassArgument inference Parameters of the network used for inference
-- @tfield _ExperiencePool.InitArguments experience_pool Parameters of the memory of the agent
-- @tfield[opt=0] number learn_start Number of steps after which learning starts
-- @tfield[opt=1] number update_freq Learning frequency (epoch size)
-- @tfield[opt=1] number minibatch_size Number of samples drawn for each learning minibatch
-- @tfield[opt=1] number n_replay Number of minibatch updates per learning epoch
-- @tfield[opt=false] boolean rescale_r Scale rewards
-- @tfield[opt] number max_reward Upper bound used to clip rewards
-- @tfield[opt] number min_reward Lower bound used to clip rewards
-- @tfield[opt=1] number ep_start Initial value of epsilon
-- @tfield[opt=ep_start] number ep_end Final value of epsilon
-- @tfield[opt=1000000] number ep_endt Epsilon annealing time
-- @tfield[opt=0.01] number ep_eval Epsilon value when evaluating
-- @tfield[opt=0.001] number lr Learning rate
-- @tfield[opt=0.99] number discount Q-learning Discount factor (0 < x < 1)
-- @tfield[opt=nil] number clip_delta Clipping value for delta
-- @tfield[opt=nil] number target_q How long a target network is valid
-- @tfield[opt=0] number wc L2 weight cost.
-- @tfield[opt={}] RMSPropArgument rmsprop Pre-initialized RMSProp arguments.
-- @warn Some overlap between @{Dump|dump} and @{InitArguments|arguments} should still be cleaned up...
-- @table InitArguments
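As a rough orientation for the table above, a hypothetical `InitArguments` value could look like the sketch below; every concrete number, class name, and the way the constructor is reached (`agent.NeuralQLearner`) are illustrative assumptions, not values prescribed by this commit.

-- Hypothetical construction of a NeuralQLearner (all values illustrative).
local args = {
    observation_size = {1, 84, 84},                 -- {d, w, h}
    actions = {[0] = 'noop', 'left', 'right'},      -- actions[0] is noop, as documented
    inference = {class = 'ConvNet', params = {}},   -- assumed network class name
    experience_pool = {pool_size = 2^20, history_length = 4},
    learn_start = 5000,
    update_freq = 4,
    minibatch_size = 32,
    n_replay = 1,
    ep_start = 1, ep_end = 0.1, ep_endt = 1e6, ep_eval = 0.01,
    lr = 0.00025,
    discount = 0.99,
    target_q = 10000,
}
-- Assuming the agent module exposes the constructor as agent.NeuralQLearner:
local dqn = agent.NeuralQLearner(args)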
--- Dump extracted from a `NeuralQLearner`.
-- All @{torch.Tensor/tensor.md/|Tensors} are converted to the
-- @{torch.default_type/utility.md#string-torchgetdefaulttensortype/|default type}
-- to avoid GPU incompatibilities.
-- @table Dump
--- Arguments used to instantiate a class.
-- @tfield string class Name of the class to instantiate
-- @tfield table params Parameters of the class (see class documentation)
-- @table ClassArgument
--- Parameters for RMSProp implementation
-- @tfield torch.Tensor/tensor.md/ mean_square Accumulated average of the squared gradient
-- @tfield torch.Tensor/tensor.md/ mean Accumulated average of the gradient
-- @tfield number decay Decay factor of the means
-- @tfield number mu Smoothing term
-- @mtodo Implement gradient descent in sub-classes?
-- @table RMSPropArgument
--- Preprocessing network.
-- This network is used to preprocess an observation from the environment
-- to turn it into a simpler state.
-- @tfield nn.Module/module.md/ network The actual network
-- @tfield table input_size The size of the input `{d, w, h}`
-- @tfield table output_size The size of the output `{d, w, h}`
-- @table PreprocessNetwork
--- Main deep neural network.
-- This network is used to get the best action given a history of preprocessed states
-- @tfield nn.Module/module.md/ network The actual network
-- @tfield table input_size The size of the input `{d, w, h}`
-- @tfield table output_size The size of the output `{w, h}`
-- @tfield torch.Tensor/tensor.md/ parameters Flat view of learnable parameters
-- @tfield torch.Tensor/tensor.md/ grad_parameters Flat view of gradient of energy wrt the learnable parameters
-- @table InferenceNetwork
--- @section end
--- Default constructor.
-- @tparam table args
--
-- @tparam table observation_size Size of the observations from the environment
-- @tparam table actions Available actions from the environment (actions[0] is noop)
--
-- @tparam[opt] table args.preprocess Parameters of the network to use to preprocess states
-- @tparam string args.preprocess.class Name of the class to instantiate
-- @tparam table args.preprocess.params Parameters of the class (see class documentation)
--
-- @tparam table args.inference Parameters of the network used for inference
-- @tparam string args.inference.class Name of the class to instantiate
-- @tparam table args.inference.params Parameters of the class (see class documentation)
--
-- @tparam table args.experience_pool Parameters of the memory of the agent
-- @tparam number args.experience_pool.pool_size Size of the @{agent._ExperiencePool|experience pool}
-- @tparam number args.experience_pool.history_length Length of the history for inference
-- @tparam number args.experience_pool.history_type Type of history (see @{agent._ExperiencePool})
-- @tparam number args.experience_pool.history_spacing Spacing in history (see @{agent._ExperiencePool})
--
-- @tparam[opt=0] number args.learn_start Number of steps after which learning starts
-- @tparam[opt=1] number args.update_freq Learning frequency (epoch size)
-- @tparam[opt=1] number args.minibatch_size Number of samples to take to learn
-- @tparam[opt=1] number args.n_replay Number of minibatch learning during a learning epoch
--
-- @tparam[opt=false] boolean args.rescale_r Scale rewards
-- @tparam[opt] number args.max_reward Reward maximum value clipping
-- @tparam[opt] number args.min_reward Reward minimum value clipping
--
-- @tparam[opt=1] number args.ep_start Initial value of epsilon
-- @tparam[opt=ep_start] number args.ep_end Final value of epsilon
-- @tparam[opt=1000000] number args.ep_endt Epsilon annealing time
-- @tparam[opt=0.01] number args.ep_eval Epsilon value when evaluating
--
-- @tparam[opt=0.001] number args.lr Learning rate
--
-- @tparam[opt=0.99] number args.discount Q-learning Discount factor (0 < x < 1)
-- @tparam[opt=nil] number args.clip_delta Clipping value for delta
-- @tparam[opt=nil] number args.target_q How long a target network is valid
-- @tparam[opt=0] number args.wc L2 weight cost.
-- @tparam InitArguments args
-- @tparam[opt={}] Dump dump
function class:__init(args, dump)
super.__init(self, args, dump)
if not args then
......@@ -69,7 +102,7 @@ function class:__init(args, dump)
-- This function must be called to convert tensors/network to the appropriate
-- format (CUDA or default Tensor type) to avoid computation errors caused by
-- inconsistent types
-- @tfield func self._convert_tensor
-- @tfield function self._convert_tensor
self._convert_tensor = nil
if use_gpu then
self._convert_tensor = function(t) return t:cuda() end
......@@ -79,6 +112,9 @@ function class:__init(args, dump)
end
--- Preprocessing network.
-- @tfield PreprocessNetwork self.preprocess
self.preprocess = nil
self:_init_preprocessing_network(args.preprocess, args.observation_size)
args.experience_pool = args.experience_pool or {}
......@@ -90,13 +126,16 @@ function class:__init(args, dump)
-- @tfield agent._ExperiencePool self.experience_pool
self.experience_pool = agent._ExperiencePool(args.experience_pool, dump.experience_pool)
--- Main neural network
-- @tfield InferenceNetwork self.inference
self:_init_inference_network(args.inference, dump.inference)
--- Learning frequency (epoch size).
-- @tfield number self.update_freq
self.update_freq = args.update_freq or 12
--- Number of samples to take to learn.
-- @tparam number self.minibatch_size
-- @tfield number self.minibatch_size
self.minibatch_size = args.minibatch_size or 32
--- Number of minibatch updates during a learning epoch.
-- @tfield number self.n_replay
......@@ -117,7 +156,7 @@ function class:__init(args, dump)
self.min_reward = args.min_reward
--- Maximal encountered reward for reward scaling.
-- @see self.rescale_r
-- @tparam number self.r_max
-- @tfield number self.r_max
self.r_max = 1
-- epsilon annealing
......@@ -139,7 +178,7 @@ function class:__init(args, dump)
self.ep_eval = args.ep_eval or 0
--- Learning rate.
-- @tparam number self.lr
-- @tfield number self.lr
self.lr = args.lr or 0.00025
--- Q-learning discount factor (0 < x < 1).
......@@ -155,22 +194,19 @@ function class:__init(args, dump)
-- @tfield number self.target_q
self.target_q = args.target_q or 4096
--- L2 weight cost.
-- @tparam number self.wc
-- @tfield number self.wc
self.wc = args.wc or 0
--- Number of perceived states.
-- @tparam number self.experienced_steps
-- @tfield number self.experienced_steps
self.experienced_steps = args.experienced_steps or 0
--- Number of times the agent has learned.
-- @tparam number self.learning_epoch
-- @tfield number self.learning_epoch
self.learning_epoch = args.learning_epoch or 0
--- Parameters for RMSProp implementation.
-- @tfield torch.Tensor self.rmsprop.mean_square Accumulated average of the squared gradient
-- @tfield torch.Tensor self.rmsprop.mean Accumulated average of the gradient
-- @tfield number self.rmsprop.decay Decay factor of the means
-- @tfield number self.rmsprop.mu Smoothing term
-- @table self.rmsprop
-- @tfield RMSPropArgument self.rmsprop
-- @mtodo Use dedicated classes for GD implementations.
self.rmsprop = args.rmsprop or {
mean_square = self._convert_tensor(torch.zeros(self.inference.parameters:size())),
mean = self._convert_tensor(torch.zeros(self.inference.parameters:size())),
......@@ -183,7 +219,7 @@ function class:__init(args, dump)
end
--- Current target network.
-- @tparam nn.Module self.target_network
-- @tfield nn.Container/containers.md/ self.target_network
self.target_network = args.target_network and self._convert_tensor(args.target_network) or nil
if not self.target_network and self.target_q then
self.target_network = self.inference.network:clone()
......@@ -200,7 +236,12 @@ end
--- Public Methods
-- @section public-methods
--- Overriden method
--- Integrate current observation from the environment.
-- @tparam table state The current state of the environment
-- @tparam torch.Tensor state.observation The actual observations
-- @tparam boolean state.terminal Is this state terminal?
-- @return `self`
-- @override true
-- @see BaseAgent:integrate_observation
function class:integrate_observation(state)
self.experience_pool:record_state(
......@@ -211,7 +252,9 @@ function class:integrate_observation(state)
return self
end
--- Overriden method
--- Return an action to execute.
-- @treturn number Action to execute
-- @override true
-- @see BaseAgent:get_action
function class:get_action()
-- Fetch the last state with all its history
......@@ -224,7 +267,10 @@ function class:get_action()
return action_index
end
--- Overriden method
--- Reward or punish the agent.
-- @tparam number reward Reward if positive, punishment if negative
-- @return `self`
-- @override true
-- @see BaseAgent:give_reward
function class:give_reward(reward)
-- Threshold the reward
......@@ -253,19 +299,27 @@ function class:give_reward(reward)
return self
end
--- Overriden method
--- Return how many interactions the agent has experienced.
-- @treturn number Number of interactions done
-- @override true
-- @see BaseAgent:get_experienced_interactions
function class:get_experienced_interactions()
return self.experienced_steps
end
--- Overriden method
--- Return how many times the agent actually learned from its experience.
-- @treturn number Number of times the agent learned
-- @override true
-- @see BaseAgent:get_learned_epoch
function class:get_learned_epoch()
return self.learning_epoch
end
--- Overriden method
--- Put the agent in a training mode.
--
-- This is the default mode of an agent.
-- @return `self`
-- @override true
-- @see BaseAgent:training
function class:training()
self._logger:debug("Passing in TRAINING mode")
......@@ -274,7 +328,12 @@ function class:training()
return self
end
--- Overriden method
--- Put the agent in an evaluation mode.
--
-- The agent will use a separate @{_ExperiencePool|experience pool}
-- and a dedicated epsilon value.
-- @return `self`
-- @override true
-- @see BaseAgent:evaluate
function class:evaluate()
self._logger:debug("Passing in EVALUATING mode")
......@@ -283,8 +342,17 @@ function class:evaluate()
return self
end
--- Overriden method
-- @see BaseAgent:dump
--- Dump the agent.
--
-- Redundant information (like network parameters) is removed
-- to save space.
-- Tensors are converted to
-- @{torch.default_type/utility.md#string-torchgetdefaulttensortype/|default type}
-- to avoid GPU incompatibilities.
-- @tparam table cycles Set of already dumped objects
-- @treturn Dump A reloadable dump
-- @override true
-- @see ArcadesComponent:dump
function class:dump(cycles)
local copy_table = require('pl.tablex').copy
local default_type = torch.getdefaulttensortype()
......@@ -307,18 +375,9 @@ end
-- @section private-methods
--- Initialize the preprocessing network.
-- @tparam[opt] table args
-- @tparam string args.class Name of the class to instantiate
-- @tparam table args.params Parameters to instantiate the class
-- @tparam table observation_size Size of input tensor
-- @tparam ClassArgument args
-- @tparam table observation_size Size of input tensor `{d, w, h}`
function class:_init_preprocessing_network(args, observation_size)
--- Preprocessing network.
-- This network is used to preprocess an observation from the environment
-- to change it to a simpler state
-- @tfield nn.Module self.preprocess.network The actual network
-- @tfield table self.preprocess.input_size The size of the input
-- @tfield table self.preprocess.output_size The size of the output
-- @table self.preprocess
local preprocess = {}
if args and args.network then
-- Reload a dumped agent
......@@ -350,22 +409,12 @@ function class:_init_preprocessing_network(args, observation_size)
end
--- Initialize the main inference network.
-- @tparam table args
-- @tparam[opt] string args.class Name of the class to instantiate
-- @tparam[opt] string args.file Path to a previously dumped network to reload
-- @tparam[opt] table args.params Parameters
-- @tparam ClassArgument args
-- @tparam[opt={}] Dump dump
function class:_init_inference_network(args, dump)
args = args or {}
-- Check that we reload a dumped agent, load a saved file, or instantiate a class
assert(args.network or args.file or args.class or dump, "No network was given to the agent.")
--- Main deep neural network.
-- This network is used to get the best action given a history of preprocessed states
-- @tfield nn.Module self.inference.network The actual network
-- @tfield table self.inference.input_size The size of the input
-- @tfield table self.inference.output_size The size of the output
-- @tfield torch.Tensor self.inference.parameters Flat view of learnable parameters
-- @tfield torch.Tensor self.inference.grad_parameters Flat view of gradient of energy wrt the learnable parameters
-- @table self.inference
local network = require("arcades.network")
local inference
......@@ -406,7 +455,9 @@ function class:_init_inference_network(args, dump)
end
--- Learn from the past experiences.
-- @return self
--
-- This does nothing if the agent is in @{evaluate|evaluating} mode.
-- @return `self`
function class:_learn()
if not self.evaluating and
self.experienced_steps > self.learn_start and
......@@ -422,9 +473,10 @@ function class:_learn()
return self
end
--- Get the action to execute according to epsilon-greedy policy.
--- Get the action to execute according to an epsilon-greedy policy.
-- @param state Current state
-- @treturn number Chosen action
-- @mtodo Use dedicated classes for strategies.
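Before the definition below, it may help to see how the annealed epsilon is typically computed in DQN implementations. This sketch only follows the parameters documented above (`ep_start`, `ep_end`, `ep_endt`, `learn_start`); it is an assumption about the elided body, not a copy of it.

-- Hedged sketch of linear epsilon annealing (assumed; the real body is elided in this diff).
-- Epsilon goes from ep_start down to ep_end over ep_endt steps, once learn_start steps are done.
local function annealed_epsilon(self)
    local steps = math.max(0, self.experienced_steps - self.learn_start)
    local fraction = math.min(steps / self.ep_endt, 1)
    return self.ep_end + (1 - fraction) * (self.ep_start - self.ep_end)
end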
function class:_eGreedy(state)
if self.evaluating then
self.ep = self.ep_eval
......@@ -464,7 +516,7 @@ function class:_greedy(state)
state = self._convert_tensor(state)
local q = self.inference.network:forward(state)
q = q:reshape(q:nElement())
q = q:reshape(q:nElement()) -- get Q-values of each action given state
local maxq = q[1]
local besta = {1}
......@@ -489,7 +541,7 @@ end
-- @param args.r Reward
-- @param args.s2 Final state
-- @param args.t Is state final?
-- @treturn torch.Tensor targets Expected action-values
-- @treturn torch.Tensor/tensor.md/ Expected action-values
function class:_getQUpdate(args)
local s = self._convert_tensor(args.s)
local a = self._convert_tensor(args.a)
......@@ -552,7 +604,7 @@ function class:_getQUpdate(args)
return targets
end
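For orientation, the standard DQN target that a method like `_getQUpdate` computes is sketched below, written against the documented fields (`s2`, `r`, `t`, `self.discount`, `self.target_network`). It is the textbook form from the Mnih et al. letter cited above and an assumption about the elided body, not the commit's actual code.

-- Hedged sketch of the DQN target: target = r + (1 - t) * discount * max_a' Q_target(s2, a').
local function q_targets(self, s2, r, t)
    local q2 = self.target_network:forward(s2)      -- Q-values of the next states, one row per transition
    local q2_max = q2:max(2):squeeze()               -- best achievable value in each s2
    local not_terminal = t:clone():mul(-1):add(1)    -- 1 where s2 is not terminal, 0 otherwise
    return r + not_terminal:cmul(q2_max):mul(self.discount)
end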
--- Apply Q-Learning on a minibatch.
--- Apply Q-Learning on a @{self.minibatch_size|minibatch}.
function class:_qLearnMinibatch()
-- Perform a minibatch Q-learning update:
-- w += alpha * (r + gamma * max_a2 Q(s2,a2) - Q(s,a)) * dQ(s,a)/dw
......
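The elided body of `_qLearnMinibatch` applies the gradient step described by the comment above. As a rough, hypothetical sketch of the centered RMSProp variant suggested by the `rmsprop` fields documented earlier (`mean_square`, `mean`, `decay`, `mu`), one update could look like the following; the exact formulation in the commit may differ.

-- Hedged sketch of an RMSProp step over the flat parameter views (assumed, not the elided body).
local function rmsprop_step(self, dw)  -- dw: "gradient" of the Q-learning objective, as in the comment above
    local rp = self.rmsprop
    rp.mean_square:mul(rp.decay):addcmul(1 - rp.decay, dw, dw)   -- running mean of squared gradient
    rp.mean:mul(rp.decay):add(1 - rp.decay, dw)                  -- running mean of gradient
    local rms = rp.mean_square - torch.cmul(rp.mean, rp.mean)    -- centered second moment
    rms:add(rp.mu):sqrt()                                        -- smooth and take the root
    -- w <- w + lr * dw / rms (ascent, matching the "w += alpha * ..." comment above)
    self.inference.parameters:addcdiv(self.lr, dw, rms)
end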
--- A random agent.
-- This agent acts randomly, choosing an action with a uniform probability over
-- all possible actions.
-- @inherit true
-- @see agent.BaseAgent
-- @see ArcadesComponent
-- @classmod agent.RandomAgent
-- @alias class
-- @author Alexis BRENON <alexis.brenon@imag.fr>
......@@ -25,14 +28,21 @@ end
--- Public Methods
-- @section public-methods
--- Overriden method
--- Integrate current observation from the environment.
-- @tparam table state The current state of the environment
-- @tparam torch.Tensor state.observation The actual observations
-- @tparam boolean state.terminal Is this state terminal?
-- @return `self`
-- @override true
-- @see BaseAgent:integrate_observation
function class:integrate_observation(state)
self.current_is_terminal = state.terminal
return self
end
--- Overriden method
--- Return an action to execute.
-- @treturn number Action to execute
-- @override true
-- @see BaseAgent:get_action
function class:get_action()
-- Select action
......@@ -46,14 +56,19 @@ end
-- luacheck: push no unused args
--- Overriden method
--- Reward or punish the agent.
-- @tparam number reward Reward if positive, punishment if negative
-- @return `self`
-- @override true
-- @see BaseAgent:give_reward
function class:give_reward(reward)
return self
end
-- luacheck: pop
--- Overriden method
--- Return how many interactions the agent has experienced.
-- @treturn number Number of interactions done
-- @override true
-- @see BaseAgent:get_experienced_interactions
function class:get_experienced_interactions()
return self.actions_frequencies:sum()
......@@ -61,14 +76,19 @@ end
-- luacheck: push no self
--- Overriden method
--- Return how many times the agent actually learned from its experience.
-- @treturn number `0`
-- @override true
-- @see BaseAgent:get_learned_epoch
function class:get_learned_epoch()
return 0
end
-- luacheck: pop
--- Overriden method
--- Return pretty-printed information about the agent.
-- For this agent, it returns the action frequencies.
-- @treturn string A human readable string about the agent
-- @override true
-- @see BaseAgent:report
function class:report()
local result = "Random Agent: actions frequencies"
......
......@@ -3,6 +3,8 @@
-- required taking account of a possible history.
-- @classmod agent._ExperiencePool
-- @alias class
-- @inherit true
-- @see ArcadesComponent
-- @author Alexis BRENON <alexis.brenon@imag.fr>
local hash = require('hash')
......@@ -15,64 +17,34 @@ local copy_table = require('pl.tablex').copy
local module = {}
local class, super = torch.class('_ExperiencePool', 'ArcadesComponent', module)
--- Attributes
-- @section attributes
--- Hash table to associate a hash (double) to a torch.Tensor representing a
-- state.
-- @usage s = self.states[self.hasher:hash(s)]
-- @field states
--- The number of elements of the hash table
-- @field hashed_states
--- Hasher object used to compute state hashes.
-- @field hasher
--- The maximal size of the pool.
-- @tfield number pool.max_size
--- The index of the last added interaction
-- @tfield number pool.last_index
--- The hashes of the recorded states.
-- @tfield table pool.states
--- The terminal signal of the states.
-- @tfield table pool.terminals
--- The action executed from the states.
-- @tfield table pool.states
--- The reward received after action.
-- @tfield table pool.rewards
--- @{push|Pushed} pools that can be restore by successive calls to @{pop}
-- @field pushed_pools
--- Number of states in a full historic state.
-- @tfield number history_length
--- Function to compute indexes of the historic state.
-- @see _compute_history_offsets
-- @tfield string history_type
--- Parameter of the @{history_type} function.
-- @see _compute_history_offsets
-- @tfield number history_spacing
--- Offsets to add to index when fetching a full historic state.
-- @tfield table history_offsets
--- Data Types
-- @section data-types
--- Table used as arguments for the ExperiencePool @{__init|constructor}.
-- @tfield number pool_size Size of the experience pool
-- @tfield table state_size Size of the states `{d, w, h}`
-- @tfield number history_length Length of the history for inference
-- @tfield string history_type Type of history
-- @tfield number history_spacing Spacing in history
-- @table InitArguments
--- Serializable dump of an `_ExperiencePool`.
-- @table Dump
--- Tables used to record interactions/experiences.
-- @tfield number max_size Maximum number of saved experiences
-- @tfield number last_index Index of the last saved experience
-- @tfield {number,...} states The hashes of the recorded states
-- @tfield {number,...} terminals Whether each state is terminal (`1`) or not (`0`)
-- @tfield {number,...} actions Actions executed
-- @tfield {number,...} rewards Rewards received
-- @table Pool
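As a reading aid for the `Pool` fields above, a fixed-size pool of this shape is usually filled as a circular buffer. The helper below is a hypothetical illustration of how `last_index` could wrap around `max_size`; it is not code from the commit.

-- Hypothetical illustration of circular recording into a Pool-shaped table.
local function record(pool, state_hash, action, reward, terminal)
    pool.last_index = (pool.last_index % pool.max_size) + 1  -- wrap around max_size
    pool.states[pool.last_index] = state_hash
    pool.actions[pool.last_index] = action
    pool.rewards[pool.last_index] = reward
    pool.terminals[pool.last_index] = terminal and 1 or 0
end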
--- @section end
--- Default constructor.
-- @tparam table args
-- @tparam[opt=2^20 ≈ 10^6] number args.pool_size Size of the experience pool
-- @tparam table args.state_size Size of the states
-- @tparam[opt=1] number args.history_length Length of a full state (with current state plus historic ones)
-- @tparam[opt="linear"] string args.history_type Function used to grab historic states (linea