
Add notations
xlnwel committed Oct 22, 2019
1 parent 94c6246 commit 6b07e01
Showing 16 changed files with 93 additions and 58 deletions.
5 changes: 2 additions & 3 deletions algo/ddqn/agent.py
@@ -33,7 +33,6 @@ def __init__(self,
# hyperparameters
self.gamma = args['gamma'] if 'gamma' in args else .99
self.update_freq = args['update_freq']
self.loss_type = args['loss_type']
self.target_update_freq = args['target_update_freq']
self.update_step = 0

@@ -84,7 +83,7 @@ def __init__(self,
device=device)

# learning rate schedule
decay_duration = float(self.args['max_steps']) / 10 # 2e7
decay_duration = float(self.args['max_steps'])
lr = float(self.args['Qnets']['learning_rate'])
end_lr = float(self.args['Qnets']['end_lr'])
self.lr_schedule = PiecewiseSchedule([(0, lr), (decay_duration / 8, lr), (decay_duration / 4, end_lr)],
@@ -297,7 +296,7 @@ def _loss(self):
self.Qnets.Q_next_target, self.gamma, self.data['steps'])

with tf.name_scope('loss'):
loss_func = huber_loss if self.loss_type == 'huber' else tf.square
loss_func = huber_loss if self.args['loss_type'] == 'huber' else tf.square
if self.buffer_type == 'proportional':
loss = tf.reduce_mean(self.data['IS_ratio'][:, None] * loss_func(Q_error), name='loss')
else:
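For reference, a minimal NumPy sketch of the loss selection in the `_loss` hunk above: a Huber or squared loss on the TD error, weighted by importance-sampling ratios when the proportional prioritized-replay buffer is used. Names and signatures here are illustrative, not the repo's `huber_loss`.

```python
import numpy as np

def huber_loss(x, delta=1.0):
    # Quadratic inside [-delta, delta], linear outside; less sensitive to outliers than x**2.
    abs_x = np.abs(x)
    return np.where(abs_x <= delta, 0.5 * x ** 2, delta * (abs_x - 0.5 * delta))

def td_loss(q_error, loss_type='huber', is_ratio=None):
    # q_error: [B] TD errors; is_ratio: [B] importance-sampling weights from proportional PER.
    loss_fn = huber_loss if loss_type == 'huber' else np.square
    per_sample = loss_fn(q_error)
    if is_ratio is not None:
        per_sample = is_ratio * per_sample
    return per_sample.mean()
```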
1 change: 1 addition & 0 deletions algo/ddqn/args.yaml
@@ -39,6 +39,7 @@ agent:
learning_rate: 1.e-4
end_lr: 5.e-5
clip_norm: 10
epsilon: 0.0003125
env_stats:
times: 1
stats: [score, score_mean, score_std, score_best, epslen_mean, epslen_std]
2 changes: 1 addition & 1 deletion algo/ddqn/base_net.py
@@ -54,7 +54,7 @@ def _head_net(self, x, out_dim, name=None):
def net(x, out_dim):
layer = self.noisy if self.use_noisy else self.dense
name_fn = lambda i: f'noisy_{i}' if self.use_noisy else f'dense_{i}'
x = layer(x, 512, name=name_fn(1))
x = layer(x, 256, name=name_fn(1))
x = norm_activation(x, norm=self.dense_norm, activation=tf.nn.relu)
x = layer(x, out_dim, name=name_fn(2))
return x
12 changes: 7 additions & 5 deletions algo/rainbow/agent.py
@@ -58,15 +58,18 @@ def _loss(self):

Tz = n_step_target(self.data['reward'], self.data['done'],
z_support[None, :], self.gamma, self.data['steps']) # [B, N]
# compute projection
Tz = tf.clip_by_value(Tz, v_min, v_max)[:, None, :] # [B, 1, N]
z_original = z_support[None, :, None] # [1, N, 1]
z_support = z_support[None, :, None] # [1, N, 1]

weight = tf.clip_by_value(1. - tf.abs(Tz - z_original) / delta_z, 0, 1) # [B, N, N]
dist_target = tf.reduce_sum(weight * self.Qnets.dist_next_target, axis=2) # [B, N]
# quotient = `[1 - |Tz - z| / Δz]_0^1` in Eq. 7 of the C51 paper
quotient = tf.clip_by_value(1. - tf.abs(Tz - z_support) / delta_z, 0, 1) # [B, N, N]
dist_next = tf.expand_dims(self.Qnets.dist_next_target, 1) # [B, 1, N]
dist_target = tf.reduce_sum(quotient * dist_next, axis=2) # [B, N]
dist_target = tf.stop_gradient(dist_target)

kl_loss = tf.nn.softmax_cross_entropy_with_logits(labels=dist_target, logits=self.Qnets.logits)
loss = tf.reduce_mean(kl_loss, name='loss')
loss = tf.reduce_mean(self.data['IS_ratio'] * kl_loss, name='loss')

with tf.name_scope('priority'):
priority = self._rescale(kl_loss)
@@ -80,5 +83,4 @@ def _log_info(self):
if self.buffer_type == 'proportional':
stats_summary('priority', self.priority, max=True, std=True)
stats_summary('Q', self.Qnets.Q, max=True)
stats_summary('prob', self.Qnets.dist, max=True)
tf.summary.scalar('loss_', self.loss)
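For reference, a minimal NumPy sketch of the categorical (C51) projection that the `quotient` tensor above implements, assuming a one-step target and illustrative support bounds; the repo builds `Tz` with its n-step helper instead.

```python
import numpy as np

def c51_projection(reward, done, next_dist, gamma=0.99,
                   v_min=-10.0, v_max=10.0, n_atoms=51):
    """Project the Bellman-updated support onto the fixed support (Eq. 7 of the C51 paper).

    reward, done: [B]; next_dist: [B, N] target-network probabilities at the
    greedy next action. Returns the projected target distribution, [B, N].
    """
    z = np.linspace(v_min, v_max, n_atoms)                  # fixed support, [N]
    delta_z = (v_max - v_min) / (n_atoms - 1)
    Tz = reward[:, None] + gamma * (1. - done[:, None]) * z[None, :]          # [B, N]
    Tz = np.clip(Tz, v_min, v_max)[:, None, :]              # [B, 1, N]
    quotient = np.clip(1. - np.abs(Tz - z[None, :, None]) / delta_z, 0., 1.)  # [B, N, N]
    return (quotient * next_dist[:, None, :]).sum(axis=2)   # [B, N]
```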
1 change: 1 addition & 0 deletions algo/rainbow/args.yaml
@@ -41,6 +41,7 @@ agent:
learning_rate: 1.e-4
end_lr: 5.e-5
clip_norm: 10
epsilon: 0.0003125
env_stats:
times: 1
stats: [score, score_mean, score_std, score_best, epslen_mean, epslen_std]
5 changes: 2 additions & 3 deletions algo/rainbow/network.py
@@ -40,7 +40,7 @@ def _build_graph(self):
self.best_action = select_action(Qs, 'best_action')
next_action = select_action(Qs_next, 'next_action')

# [B, 1, N], [B, 1, N]
# [B, N], [B, N]
self.logits = self._c51_action_value(self.action, logits, 'logits')
self.dist_next_target = self._c51_action_value(next_action, dist_next_target, 'dist_next_target')
# for tensorboard bookkeeping
@@ -73,6 +73,5 @@ def _c51_action_value(self, action, values, name):
with tf.name_scope(name):
action = action[..., None]
value = tf.gather_nd(values, action, batch_dims=1)
value = tf.expand_dims(value, axis=1)

return value
return value
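For reference, the `tf.gather_nd(..., batch_dims=1)` call above selects each sample's row for the chosen action; a minimal NumPy equivalent of that per-action gather (shapes illustrative):

```python
import numpy as np

def select_action_values(values, action):
    """values: [B, A, N] per-action distributions/logits; action: [B] int indices.

    Returns the [B, N] slice for each sample's chosen action, with no extra
    singleton axis (the removed `expand_dims` above used to add one).
    """
    return values[np.arange(values.shape[0]), action]
```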
22 changes: 10 additions & 12 deletions algo/rainbow_iqn/agent.py
@@ -23,7 +23,6 @@ def __init__(self,
log_stats=False,
device=None,
reuse=None):

super().__init__(name, args,
env_args, buffer_args,
sess_config=sess_config,
@@ -52,18 +51,17 @@ def _loss(self):
def tiled_n_step_target():
n_quantiles = self.args['Qnets']['N_prime']

reward_tiled = tf.reshape(tf.tile(self.data['reward'], [n_quantiles, 1]),
[n_quantiles, -1, 1])
done_tiled = tf.reshape(tf.tile(self.data['done'], [n_quantiles, 1]),
[n_quantiles, -1, 1])
steps_tiled = tf.reshape(tf.tile(self.data['steps'], [n_quantiles, 1]),
[n_quantiles, -1, 1])
return n_step_target(reward_tiled, done_tiled,
reward = self.data['reward'][None, ...]
done = self.data['done'][None, ...]
steps = self.data['steps'][None, ...]
return n_step_target(reward, done,
self.Qnets.quantile_values_next_target,
self.gamma, steps_tiled)

self.gamma, steps)
def quantile_regression_loss(u):
# [B, N, N']
abs_part = tf.abs(self.Qnets.quantiles - tf.where(u < 0, tf.ones_like(u), tf.zeros_like(u)))

huber = huber_loss(u, delta=self.args['Qnets']['delta'])

qr_loss = tf.reduce_sum(tf.reduce_mean(abs_part * huber, axis=2), axis=1) # [B]
@@ -75,10 +73,10 @@ def quantile_regression_loss(u):
self.Qnets.Q_next_target, self.gamma, self.data['steps'])

with tf.name_scope('loss'):
quantile_values_target = tiled_n_step_target()
quantile_values_target = tiled_n_step_target() # [N', B, 1]
quantile_values_target = tf.transpose(quantile_values_target, [1, 2, 0]) # [B, 1, N']
quantile_values = tf.transpose(self.Qnets.quantile_values, [1, 0, 2]) # [B, N, 1]
quantile_error = tf.abs(quantile_values - quantile_values_target)
quantile_error = quantile_values_target - quantile_values # [B, N, N']

loss = quantile_regression_loss(quantile_error)

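For reference, a minimal NumPy sketch of the quantile-regression Huber loss assembled above, with target values of shape [B, 1, N'], predicted quantile values of shape [B, N, 1], and sampled quantile fractions τ of shape [B, N, 1]; names here are illustrative.

```python
import numpy as np

def huber(u, delta=1.0):
    abs_u = np.abs(u)
    return np.where(abs_u <= delta, 0.5 * u ** 2, delta * (abs_u - 0.5 * delta))

def quantile_regression_loss(quantile_values, target_values, taus, delta=1.0):
    """quantile_values: [B, N, 1], target_values: [B, 1, N'], taus: [B, N, 1] in (0, 1)."""
    u = target_values - quantile_values                   # TD errors, broadcast to [B, N, N']
    asym_weight = np.abs(taus - (u < 0).astype(u.dtype))  # |tau - 1{u < 0}|
    per_sample = (asym_weight * huber(u, delta)).mean(axis=2).sum(axis=1)  # [B]
    return per_sample.mean()
```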
2 changes: 1 addition & 1 deletion algo/rainbow_iqn/args.yaml
@@ -24,7 +24,6 @@ agent:
frame_history_len: 4
update_freq: 4
n_steps: 3
loss_type: huber # huber or mse
target_update_freq: 8000 # we count update step, which is 4 times frame steps

Qnets:
@@ -43,6 +42,7 @@ agent:
learning_rate: 1.e-4
end_lr: 5.e-5
clip_norm: 10
epsilon: 0.0003125
env_stats:
times: 1
stats: [score, score_mean, score_std, score_best, epslen_mean, epslen_std]
36 changes: 22 additions & 14 deletions algo/rainbow_iqn/network.py
@@ -57,27 +57,33 @@ def _build_graph(self):

# quantile_values for regression loss
# Q for priority required by PER
self.quantile_values, self.Q = self._iqn_values(self.action, self.N, quantile_values, Qs)
# [N, B, 1], [B, 1]
self.quantile_values, self.Q = self._iqn_values(self.action, self.N, quantile_values, Qs, 'quantile_q')
# [N', B, 1], [B, 1]
self.quantile_values_next_target, self.Q_next_target = self._iqn_values(next_action,
self.N_prime,
quantile_values_next_target,
Qs_next_target)
Qs_next_target,
'next_quantile_q')

def _iqn_net(self, x, n_quantiles, batch_size, out_dim,
psi_net, phi_net, f_net, name, reuse=None):
quantile_embedding_dim = self.args['quantile_embedding_dim']

with tf.variable_scope(name, reuse=reuse):
# 𝜓 function in the paper
# [N*B, H]
x_tiled = psi_net(x, n_quantiles)

h_dim = x_tiled.shape.as_list()[1]

# 𝜙 function in the paper
# [B, N, 1], [N*B, H]
quantiles, x_quantiles = phi_net(n_quantiles, batch_size, quantile_embedding_dim, h_dim)
# Combine outputs of psi and phi
y = x_tiled * x_quantiles
y = x_tiled * x_quantiles # [N*B, H]
# f function in the paper
# [N, B, O], [B, O]
v_qv, v = f_net(y, 1, n_quantiles, batch_size, name='value_net')
a_qv, a = f_net(y, out_dim, n_quantiles, batch_size, name='adv_net')

@@ -90,7 +96,7 @@ def _iqn_net(self, x, n_quantiles, batch_size, out_dim,
def _psi_net(self, x, n_quantiles):
with tf.variable_scope('psi_net'):
x = self._conv_net(x)
x_tiled = tf.tile(x, [n_quantiles, 1])
x_tiled = tf.tile(x, [n_quantiles, 1]) # [N*B, H]

return x_tiled

Expand All @@ -99,34 +105,36 @@ def _phi_net(self, n_quantiles, batch_size, quantile_embedding_dim, h_dim):
quantile_shape = [n_quantiles * batch_size, 1]
quantiles = tf.random.uniform(quantile_shape, minval=0, maxval=1) # [N*B, 1]
quantiles_tiled = tf.tile(quantiles, [1, quantile_embedding_dim]) # [N*B, D]
# returned quantiles for computing quantile regression loss
# returned quantiles for computing quantile regression loss, [B, N, 1]
quantiles_reformed = tf.transpose(tf.reshape(quantiles, [n_quantiles, batch_size, 1]), [1, 0, 2])

with tf.variable_scope('phi_net'):
pi = tf.constant(np.pi)
degrees = tf.cast(tf.range(quantile_embedding_dim), tf.float32) * pi * quantiles_tiled
x_quantiles = tf.cos(degrees)
x_quantiles = tf.layers.dense(x_quantiles, h_dim)
x_quantiles = tf.nn.relu(x_quantiles)
x_quantiles = tf.nn.relu(x_quantiles) # [N*B, H]

return quantiles_reformed, x_quantiles

def _f_net(self, x, out_dim, n_quantiles, batch_size, name=None):
name = f'{name}_f_net' if name else 'f_net'
with tf.variable_scope(name):
quantile_values = self._head_net(x, out_dim)
quantile_values = tf.reshape(quantile_values, (n_quantiles, batch_size, out_dim))
q = tf.reduce_mean(quantile_values, axis=0)
quantile_values = self._head_net(x, out_dim) # [N*B, O]
quantile_values = tf.reshape(quantile_values, (n_quantiles, batch_size, out_dim)) # [N, B, O]
q = tf.reduce_mean(quantile_values, axis=0) # [B, O]

return quantile_values, q

def _iqn_values(self, action, n_quantiles, quantile_values, Qs):
with tf.name_scope('action_values'):
action_tiled = tf.reshape(tf.tile(action, [n_quantiles]),
def _iqn_values(self, action, n_quantiles, quantile_values, Qs, name):
with tf.name_scope(name):
action_tiled = tf.reshape(tf.tile(action, [n_quantiles]),
[n_quantiles, -1])
quantile_values = tf.reduce_sum(tf.one_hot(action_tiled, self.n_actions)
* quantile_values, axis=2, keepdims=True)
* quantile_values, axis=2, keepdims=True,
name='quantile_values')
q = tf.reduce_sum(tf.one_hot(action, self.n_actions)
* Qs, axis=1, keepdims=True)
* Qs, axis=1, keepdims=True,
name='q')

return quantile_values, q
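For reference, a minimal NumPy sketch of the quantile embedding computed by `_phi_net` above: sample τ ~ U(0, 1), embed it with cos(πiτ) for i = 0, …, D−1, pass the embedding through a dense layer with ReLU, and later combine it with the tiled state features by an elementwise product. The dense weights below are random placeholders standing in for the learned layer.

```python
import numpy as np

def quantile_embedding(batch_size, n_quantiles, embedding_dim=64, h_dim=512, rng=np.random):
    taus = rng.uniform(size=(n_quantiles * batch_size, 1))        # [N*B, 1]
    i = np.arange(embedding_dim, dtype=np.float64)                # [D]
    cos_features = np.cos(np.pi * i[None, :] * taus)              # [N*B, D]
    # stand-in for the learned dense layer + ReLU in _phi_net
    w = rng.normal(scale=embedding_dim ** -0.5, size=(embedding_dim, h_dim))
    phi = np.maximum(cos_features @ w, 0.)                        # [N*B, H]
    # taus reshaped to [B, N, 1] for the quantile-regression loss
    taus_reformed = taus.reshape(n_quantiles, batch_size, 1).transpose(1, 0, 2)
    return taus_reformed, phi

# combined with tiled state features psi(x) of shape [N*B, H]: y = psi_tiled * phi
```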
22 changes: 14 additions & 8 deletions algo/single_train.py
@@ -38,8 +38,8 @@ def train(agent):
start_time = time.time()
while step <= max_steps:
train_duration, (scores, epslens, step, itr) = timeit(agent.train, step, itr)
pwc(f'Training Duration: {train_duration:2f}s\n', 'blue')
pwc(f'Average Score: {np.mean(scores):2f}\n', 'blue')
pwc(f'Training Duration: {train_duration:.2f}s', 'blue')
pwc(f'Average Score: {np.mean(scores):.2f}', 'blue')
pwc(f'Average Epslen: {np.mean(epslens):.2f}', 'blue')

eval_duration, (eval_scores, eval_epslens) = timeit(agent.eval)
@@ -53,10 +53,10 @@
epslen_mean = np.mean(eval_epslens)
epslen_std = np.std(eval_epslens)

if score_best_mean > score_mean:
if score_mean > score_best_mean:
score_best_mean = score_mean
agent.save()

agent.record_stats(global_step=step, score=score, score_mean=score_mean,
score_std=score_std, score_best=score_best,
epslen_mean=epslen_mean, epslen_std=epslen_std)
@@ -67,7 +67,6 @@
'Episode': itr,
'TimeElapsed': f'{time.time() - start_time:.2f}s',
'Score': score,
'TrainScoreMean': np.mean(agent.env.get_episode_rewards()[-20:]),
'ScoreMean': score_mean,
'ScoreStd': score_std,
'ScoreBest': score_best,
@@ -101,9 +100,16 @@ def main(env_args, agent_args, buffer_args, render=False, restore=False):

agent_args['env_stats']['times'] = 1
sess_config = get_sess_config(2)
agent = Agent('Agent', agent_args, env_args, buffer_args,
save=True, log=True, log_tensorboard=True,
log_stats=True, log_params=False)
agent = Agent('Agent',
agent_args,
env_args,
buffer_args,
sess_config=sess_config,
save=True,
log=True,
log_tensorboard=True,
log_stats=True,
log_params=False)

if restore:
agent.restore()
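The comparison fix above reverses an inverted condition: a checkpoint should be written when the new evaluation mean beats the best mean seen so far. A minimal sketch of that pattern (names illustrative):

```python
import numpy as np

def maybe_save(agent, eval_scores, score_best_mean):
    """Save only on improvement; returns the (possibly updated) best mean score."""
    score_mean = np.mean(eval_scores)
    if score_mean > score_best_mean:
        score_best_mean = score_mean
        agent.save()
    return score_best_mean
```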
8 changes: 4 additions & 4 deletions basic_model/layer.py
@@ -247,11 +247,11 @@ def fixup_residual(self, x, norm=None, name=None):

y = x
with tf.variable_scope(name):
y = tf.nn.relu(y)
y = tf_utils.norm_activation(y, norm=norm, activation=tf.nn.relu)
y = y + get_bias('bias1')
y = conv(y)
y = y + get_bias('bias2')
y = tf.nn.relu(y)
y = tf_utils.norm_activation(y, norm=norm, activation=tf.nn.relu)
y = y + get_bias('bias3')
y = conv(y)
y = y * get_scale() + get_bias('bias4')
@@ -266,8 +266,8 @@ def net_fn(x):
with tf.variable_scope(f'block_{i}_{filters}'):
x = self.conv_norm_activation(x, filters, 3, padding='same', norm=conv_norm)
x = tf.layers.max_pooling2d(x, 3, 2, padding='same')
x = residual(x, f'residual_1')
x = residual(x, f'residual_2')
x = residual(x, name='residual_1')
x = residual(x, name='residual_2')

x = tf.nn.relu(x)
x = tf.layers.flatten(x)
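The change above routes the pre-activations through `norm_activation`, so an optional normalization runs before the ReLU instead of a bare `tf.nn.relu`. A minimal sketch of the assumed composition (not the repo's exact helper):

```python
def norm_activation(x, norm=None, activation=None):
    """Apply an optional normalization function, then an optional activation."""
    if norm is not None:
        x = norm(x)
    if activation is not None:
        x = activation(x)
    return x
```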
4 changes: 2 additions & 2 deletions env/atari_wrappers.py
@@ -291,9 +291,9 @@ def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False,

def make_deepmind_atari(args):
env = make_atari(args['name'])
if 'log_video' in args and args['log_video']:
if args['log_video']:
# put monitor in middle to properly record episodic information
env = gym.wrappers.Monitor(env, args['video_path'])
env = gym.wrappers.Monitor(env, args['video_path'], force=True)
env = wrap_deepmind(env, args['episode_life'])

return env
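As the comment above notes, the Monitor wrapper is inserted before the DeepMind wrappers so it records raw episodic returns and full episodes, and `force=True` lets Gym overwrite an existing recording directory. A minimal sketch of that ordering with the old Gym Monitor API (wrapper details elided):

```python
import gym

def make_recorded_env(name, video_path, log_video=False):
    env = gym.make(name)
    if log_video:
        # Record before reward clipping / episode-life wrappers so the monitor
        # sees unclipped returns and whole episodes; force=True overwrites old logs.
        env = gym.wrappers.Monitor(env, video_path, force=True)
    # DeepMind-style preprocessing (frame skip, grayscale, reward clipping, ...) goes here.
    return env
```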
9 changes: 9 additions & 0 deletions readme.md
@@ -28,6 +28,15 @@ Episodic rewards averaged over 100 episodes **at training time**.
<figcaption></figcaption>
</figure>

Compared to [Google's Dopamine](https://github.com/google/dopamine), shown below, our implementation achieves better performance on Breakout.

<figure>
<img src="results/dopamine-BreakoutNoFrameskip.png" alt="" width="1000">
<figcaption></figcaption>
</figure>

Source: https://google.github.io/dopamine/baselines/plots.html; each iteration corresponds to 250,000 steps.

## Running

```shell
Binary file added results/dopamine-breakout.png
12 changes: 10 additions & 2 deletions run/grid_search.py
@@ -8,14 +8,15 @@


class GridSearch:
def __init__(self, args_file, train_func, render=False, n_trials=1, dir_prefix=''):
def __init__(self, args_file, train_func, render=False, n_trials=1, sub_process=False, dir_prefix=''):
args = load_args(args_file)
self.env_args = args['env']
self.agent_args = args['agent']
self.buffer_args = args['buffer'] if 'buffer' in args else {}
self.train_func = train_func
self.render = render
self.n_trials = n_trials
self.sub_process = sub_process
self.dir_prefix = dir_prefix

self.processes = []
@@ -24,7 +25,14 @@ def __call__(self, **kwargs):
self._dir_setup()
if kwargs == {} and self.n_trials == 1:
# if no argument is passed in, run the default setting
self.train_func(self.env_args, self.agent_args, self.buffer_args, self.render)
if self.sub_process:
p = Process(target=self.train_func,
args=(self.env_args, self.agent_args, self.buffer_args, self.render))
p.start()
time.sleep(1)
self.processes.append(p)
else:
self.train_func(self.env_args, self.agent_args, self.buffer_args, self.render)
else:
# do grid search
self.agent_args['model_name'] = 'GS'
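The new `sub_process` flag runs the training function in a child process instead of the caller's process; a minimal sketch of that pattern, assuming the imports the hunk relies on (`multiprocessing.Process`, `time`):

```python
import time
from multiprocessing import Process

def launch(train_func, env_args, agent_args, buffer_args, render=False, sub_process=False):
    if not sub_process:
        train_func(env_args, agent_args, buffer_args, render)
        return None
    p = Process(target=train_func, args=(env_args, agent_args, buffer_args, render))
    p.start()
    time.sleep(1)   # stagger start-up, as in the diff, so runs don't race on shared directories
    return p
```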