From f4f5cc357112395f6ed691cb04157c1631cdacf9 Mon Sep 17 00:00:00 2001 From: carloderamo Date: Wed, 1 Aug 2018 18:18:14 +0200 Subject: [PATCH] new core added --- atari.py | 79 -------------------------------------- core.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++ policy.py | 8 ++-- shared/dqn.py | 23 +++++------ shared/run.py | 81 ++++++++++++++++++--------------------- 5 files changed, 158 insertions(+), 137 deletions(-) delete mode 100644 atari.py create mode 100644 core.py diff --git a/atari.py b/atari.py deleted file mode 100644 index 49c3962..0000000 --- a/atari.py +++ /dev/null @@ -1,79 +0,0 @@ -import cv2 -cv2.ocl.setUseOpenCL(False) -import numpy as np - -from mushroom.environments import Atari, Environment, MDPInfo -from mushroom.utils.spaces import Box, Discrete - - -class AtariMultiple(Environment): - def __init__(self, name, width=84, height=84, ends_at_life=False, - max_pooling=True, n_steps_per_game=32): - # MPD creation - self.envs = list() - for n in name: - self.envs.append(Atari(n, width, height, ends_at_life, max_pooling)) - - max_actions = np.array([e.info.action_space.n for e in self.envs]).max() - - self._current_idx = 0 - self._current_step = 0 - self._freezed_env = False - self._learn_idx = None - self._n_steps_per_game = n_steps_per_game - self._state = [None] * len(self.envs) - - # MDP properties - action_space = Discrete(max_actions) - observation_space = Box(low=0., high=255., shape=(width, height)) - horizon = np.inf # the gym time limit is used. - gamma = .99 - mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) - - super().__init__(mdp_info) - - def reset(self, state=None): - state = self.envs[self._current_idx].reset(state) - self._state[self._current_idx] = self._augment_state(state) - - return self._state[self._current_idx] - - def step(self, action): - if not self._freezed_env: - self._current_step += 1 - if self._current_step == self._n_steps_per_game: - self._current_idx += 1 - if self._current_idx == len(self.envs): - self._current_idx = 0 - self._current_step = 0 - - return self.reset(), 0, 0, {} - state, reward, absorbing, info = self.envs[ - self._current_idx].step(action) - self._state[self._current_idx] = self._augment_state(state) - - return self._state[self._current_idx], reward, absorbing, info - - def render(self, mode='human'): - self.envs[self._current_idx].render(mode=mode) - - def stop(self): - self.envs[self._current_idx].stop() - - def set_env(self, idx=None): - if idx is None: - self._current_idx = self._learn_idx - self._learn_idx = None - else: - if self._learn_idx is None: - self._learn_idx = self._current_idx - self._current_idx = idx - - def set_episode_end(self, ends_at_life): - self.envs[self._current_idx].set_episode_end(ends_at_life) - - def freeze_env(self, freeze): - self._freezed_env = freeze - - def _augment_state(self, state): - return np.array([np.array([self._current_idx]), state]) diff --git a/core.py b/core.py new file mode 100644 index 0000000..2ed8593 --- /dev/null +++ b/core.py @@ -0,0 +1,104 @@ +from tqdm import tqdm + +import numpy as np + + +class Core(object): + def __init__(self, agent, mdp, callbacks=None): + self.agent = agent + self.mdp = mdp + self._n_mdp = len(self.mdp) + self.callbacks = callbacks if callbacks is not None else list() + + self._state = [None for _ in range(self._n_mdp)] + + self._total_steps_counter = 0 + self._current_steps_counter = 0 + self._episode_steps = [None for _ in range(self._n_mdp)] + self._n_steps_per_fit = None + + def learn(self, n_steps=None, 
n_steps_per_fit=None, render=False, + quiet=False): + self._n_steps_per_fit = n_steps_per_fit + + fit_condition = \ + lambda: self._current_steps_counter >= self._n_steps_per_fit + + self._run(n_steps, fit_condition, render, quiet) + + def evaluate(self, n_steps=None, render=False, + quiet=False): + fit_condition = lambda: False + + return self._run(n_steps, fit_condition, render, quiet) + + def _run(self, n_steps, fit_condition, render, quiet): + move_condition = lambda: self._total_steps_counter < n_steps + + steps_progress_bar = tqdm(total=n_steps, + dynamic_ncols=True, disable=quiet, + leave=False) + + return self._run_impl(move_condition, fit_condition, steps_progress_bar, + render) + + def _run_impl(self, move_condition, fit_condition, steps_progress_bar, + render): + self._total_steps_counter = 0 + self._current_steps_counter = 0 + + dataset = list() + last = [True] * self._n_mdp + while move_condition(): + for i in range(self._n_mdp): + if last[i]: + self.reset(i) + + sample = self._step(i, render) + dataset.append(sample) + + last[i] = sample[-1] + + self._total_steps_counter += 1 + self._current_steps_counter += 1 + steps_progress_bar.update(1) + + if fit_condition(): + self.agent.fit(dataset) + self._current_episodes_counter = 0 + self._current_steps_counter = 0 + + for c in self.callbacks: + callback_pars = dict(dataset=dataset) + c(**callback_pars) + + dataset = list() + + self.agent.stop() + for i in range(self._n_mdp): + self.mdp[i].stop() + + return dataset + + def _step(self, i, render): + action = self.agent.draw_action([i, self._state[i]]) + next_state, reward, absorbing, _ = self.mdp[i].step(action) + + self._episode_steps[i] += 1 + + if render: + self.mdp[i].render() + + last = not( + self._episode_steps[i] < self.mdp[i].info.horizon and not absorbing) + + state = self._state[i] + self._state[i] = np.array(next_state) # Copy for safety reasons + + return [i, state], action, reward, [i, next_state], absorbing, last + + def reset(self, i): + self._state[i] = self.mdp[i].reset() + self.agent.episode_start(i) + self.agent.next_action = None + self._episode_steps[i] = 0 diff --git a/policy.py b/policy.py index 08a25a4..e0d2ff0 100644 --- a/policy.py +++ b/policy.py @@ -40,7 +40,7 @@ def __call__(self, *args): return probs def draw_action(self, state): - idx = np.asscalar(state[0]) + idx = state[0] state = state[1] if not np.random.uniform() < self._epsilons[idx](state): q = self._approximator.predict( @@ -65,7 +65,7 @@ def set_epsilon(self, epsilon): self._epsilons[i] = epsilon def update(self, state): - idx = np.asscalar(state[0]) + idx = state[0] self._epsilons[idx].update(state) @@ -108,7 +108,7 @@ def __call__(self, *args): return probs def draw_action(self, state): - idx = np.asscalar(state[0]) + idx = state[0] state = state[1] if not np.random.uniform() < self._epsilons[idx](state): q = self._approximator[idx].predict(state) @@ -132,5 +132,5 @@ def set_epsilon(self, epsilon): self._epsilons[i] = epsilon def update(self, state): - idx = np.asscalar(state[0]) + idx = state[0] self._epsilons[idx].update(state) diff --git a/shared/dqn.py b/shared/dqn.py index a9f1e5a..6b3e9f5 100644 --- a/shared/dqn.py +++ b/shared/dqn.py @@ -39,15 +39,15 @@ def __init__(self, approximator, policy, mdp_info, batch_size, self._entropy_coeff = entropy_coeff self._replay_memory = [ - ReplayMemory(mdp_info, initial_replay_size, max_replay_size, - history_length, dtype) for _ in range(self._n_games) + ReplayMemory(mdp_info[i], initial_replay_size, max_replay_size, + history_length, dtype) for 
i in range(self._n_games) ] self._buffer = [ Buffer(history_length, dtype) for _ in range(self._n_games) ] self._n_updates = 0 - self._episode_steps = 0 + self._episode_steps = [0 for _ in range(self._n_games)] self._no_op_actions = None apprx_params_train = deepcopy(approximator_params) @@ -60,7 +60,7 @@ def __init__(self, approximator, policy, mdp_info, batch_size, self.target_approximator.model.set_weights( self.approximator.model.get_weights()) - super().__init__(policy, mdp_info) + super().__init__(policy, mdp_info[np.argmax(self._n_action_per_head)]) n_samples = self._batch_size * self._n_games self._state_idxs = np.zeros(n_samples, dtype=np.int) @@ -183,25 +183,26 @@ def _next_q(self): return out_q def draw_action(self, state): - self._buffer[np.asscalar(state[0])].add(state[1]) + idx = state[0] + self._buffer[idx].add(state[1]) - if self._episode_steps < self._no_op_actions: + if self._episode_steps[idx] < self._no_op_actions: action = np.array([self._no_op_action_value]) self.policy.update(state) else: - extended_state = self._buffer[np.asscalar(state[0])].get() + extended_state = self._buffer[idx].get() - extended_state = np.array([state[0], extended_state]) + extended_state = [idx, np.array([extended_state])] action = super(DQN, self).draw_action(extended_state) - self._episode_steps += 1 + self._episode_steps[idx] += 1 return action - def episode_start(self): + def episode_start(self, idx): if self._max_no_op_actions == 0: self._no_op_actions = 0 else: self._no_op_actions = np.random.randint( self._history_length, self._max_no_op_actions + 1) - self._episode_steps = 0 + self._episode_steps[idx] = 0 diff --git a/shared/run.py b/shared/run.py index f84f069..594fcc8 100644 --- a/shared/run.py +++ b/shared/run.py @@ -12,11 +12,11 @@ sys.path.append('..') from mushroom.approximators.parametric import PyTorchApproximator -from mushroom.core import Core +from mushroom.environments import * from mushroom.utils.dataset import compute_scores from mushroom.utils.parameters import LinearDecayParameter, Parameter -from atari import AtariMultiple +from core import Core from shared.dqn import DQN from policy import EpsGreedyMultiple @@ -260,14 +260,17 @@ def regularized_loss(arg, y): else: raise ValueError + # MDP + mdp = list() + for g in args.games: + mdp.append(Atari(g, args.screen_width, args.screen_height, + ends_at_life=False)) + n_actions_per_head = [(m.info.action_space.n,) for m in mdp] + + mdp_info = [m.info for m in mdp] + # Evaluation of the model provided by the user. 
if args.load_path: - # MDP - mdp = AtariMultiple(args.games, args.screen_width, args.screen_height, - ends_at_life=False, - n_steps_per_game=args.batch_size) - n_actions_per_head = [(m.info.action_space.n,) for m in mdp.envs] - # Policy epsilon_test = Parameter(value=args.test_exploration_rate) pi = EpsGreedyMultiple(epsilon=epsilon_test, @@ -279,8 +282,8 @@ def regularized_loss(arg, y): approximator_params = dict( network=Network, input_shape=input_shape, - output_shape=(mdp.info.action_space.n,), - n_actions=mdp.info.action_space.n, + output_shape=(max(n_actions_per_head)[0],), + n_actions=max(n_actions_per_head)[0], n_actions_per_head=n_actions_per_head, load_path=args.load_path, optimizer=optimizer, @@ -306,7 +309,7 @@ def regularized_loss(arg, y): distill=args.distill, entropy_coeff=args.entropy_coeff ) - agent = DQN(approximator, pi, mdp.info, + agent = DQN(approximator, pi, mdp_info, approximator_params=approximator_params, **algorithm_params) # Algorithm @@ -344,11 +347,6 @@ def regularized_loss(arg, y): evaluation_frequency = args.evaluation_frequency max_steps = args.max_steps - # MDP - mdp = AtariMultiple(args.games, args.screen_width, args.screen_height, - ends_at_life=True, n_steps_per_game=args.batch_size) - n_actions_per_head = [(m.info.action_space.n,) for m in mdp.envs] - # Policy epsilon = LinearDecayParameter(value=args.initial_exploration_rate, min_value=args.final_exploration_rate, @@ -364,8 +362,8 @@ def regularized_loss(arg, y): approximator_params = dict( network=Network, input_shape=input_shape, - output_shape=(mdp.info.action_space.n,), - n_actions=mdp.info.action_space.n, + output_shape=(max(n_actions_per_head)[0],), + n_actions=max(n_actions_per_head)[0], n_actions_per_head=n_actions_per_head, optimizer=optimizer, loss=regularized_loss, @@ -391,7 +389,7 @@ def regularized_loss(arg, y): entropy_coeff=args.entropy_coeff ) - agent = DQN(approximator, pi, mdp.info, + agent = DQN(approximator, pi, mdp_info, approximator_params=approximator_params, **algorithm_params) @@ -402,33 +400,30 @@ def regularized_loss(arg, y): # Fill replay memory with random dataset print_epoch(0) - mdp.freeze_env(True) - for idx in range(len(args.games)): - mdp.set_env(idx) - pi.set_epsilon(epsilon_random) - core.learn(n_steps=initial_replay_size, - n_steps_per_fit=initial_replay_size, quiet=args.quiet) + pi.set_epsilon(epsilon_random) + core.learn(n_steps=initial_replay_size, + n_steps_per_fit=initial_replay_size, quiet=args.quiet) if args.save: agent.approximator.model.save() + for m in mdp: + m.set_episode_end(False) # Evaluate initial policy - for idx in range(len(args.games)): - mdp.set_episode_end(False) - mdp.set_env(idx) - pi.set_epsilon(epsilon_test) - dataset = core.evaluate(n_steps=test_samples, render=args.render, - quiet=args.quiet) - scores[idx].append(get_stats(dataset, idx, args.games)) + pi.set_epsilon(epsilon_test) + dataset = core.evaluate(n_steps=test_samples, render=args.render, + quiet=args.quiet) + for i in range(len(mdp)): + d = dataset[i::len(mdp)] + scores[i].append(get_stats(d, i, args.games)) np.save(folder_name + '/scores.npy', scores) for n_epoch in range(1, max_steps // evaluation_frequency + 1): print_epoch(n_epoch) print('- Learning:') # learning step - mdp.freeze_env(False) - mdp.set_episode_end(True) - mdp.set_env(None) + for m in mdp: + m.set_episode_end(True) pi.set_epsilon(None) core.learn(n_steps=evaluation_frequency, n_steps_per_fit=train_frequency, quiet=args.quiet) @@ -438,14 +433,14 @@ def regularized_loss(arg, y): print('- Evaluation:') # 
evaluation step - mdp.freeze_env(True) - for idx in range(len(args.games)): - mdp.set_episode_end(False) - mdp.set_env(idx) - pi.set_epsilon(epsilon_test) - dataset = core.evaluate(n_steps=test_samples, - render=args.render, quiet=args.quiet) - scores[idx].append(get_stats(dataset, idx, args.games)) + for m in mdp: + m.set_episode_end(False) + pi.set_epsilon(epsilon_test) + dataset = core.evaluate(n_steps=test_samples, + render=args.render, quiet=args.quiet) + for i in range(len(mdp)): + d = dataset[i::len(mdp)] + scores[i].append(get_stats(d, i, args.games)) np.save(folder_name + '/scores.npy', scores)
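
Usage sketch (illustrative, not part of the patch): the new Core above round-robins over a list of MDPs, collects one sample per environment on every loop iteration, and tags each state with its environment index; shared/run.py relies on that fixed ordering when it splits the evaluation dataset per game with dataset[i::len(mdp)]. In the snippet below, ToyMDP, ToyInfo and RandomAgent are hypothetical stand-ins for the interfaces Core expects (only Core itself comes from this patch), and Python 3, NumPy and tqdm are assumed, tqdm being imported by core.py itself.

# sketch.py: drives the multi-MDP Core from core.py with toy stand-ins.
import numpy as np

from core import Core


class ToyInfo:
    # Core only reads mdp.info.horizon (in _step).
    def __init__(self, horizon):
        self.horizon = horizon


class ToyMDP:
    # Minimal environment exposing the reset/step/render/stop interface Core uses.
    def __init__(self, n_actions, horizon=10):
        self.info = ToyInfo(horizon)
        self.n_actions = n_actions
        self._t = 0

    def reset(self, state=None):
        self._t = 0
        return np.zeros(4)

    def step(self, action):
        self._t += 1
        absorbing = self._t >= self.info.horizon
        return np.random.rand(4), 0., absorbing, {}

    def render(self, mode='human'):
        pass

    def stop(self):
        pass


class RandomAgent:
    # Core hands states around as [mdp_index, observation] and calls
    # episode_start(i), fit(dataset) and stop(); next_action is reset by Core.
    def __init__(self, n_actions_per_mdp):
        self._n_actions_per_mdp = n_actions_per_mdp
        self.next_action = None

    def draw_action(self, state):
        idx = state[0]
        return np.array([np.random.randint(self._n_actions_per_mdp[idx])])

    def fit(self, dataset):
        pass

    def episode_start(self, idx):
        pass

    def stop(self):
        pass


if __name__ == '__main__':
    mdp = [ToyMDP(n_actions=4), ToyMDP(n_actions=18)]
    agent = RandomAgent([m.n_actions for m in mdp])

    core = Core(agent, mdp)

    # n_steps counts environment steps summed over all MDPs; a fit is
    # triggered every n_steps_per_fit of those steps.
    core.learn(n_steps=32, n_steps_per_fit=8)

    # Samples are appended in a fixed environment order, so a stride
    # recovers the per-game data, as done in shared/run.py.
    dataset = core.evaluate(n_steps=32)
    per_game = [dataset[i::len(mdp)] for i in range(len(mdp))]
    print([len(d) for d in per_game])  # -> [16, 16]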