
Commit

improve performance
MorvanZhou committed Jan 28, 2018
1 parent 04d9f09 commit b484df7
Showing 2 changed files with 16 additions and 17 deletions.
2 changes: 1 addition & 1 deletion contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py
@@ -97,7 +97,7 @@ def add_grad_to_graph(self, a_grads):
        with tf.variable_scope('policy_grads'):
            # ys = policy;
            # xs = policy's parameters;
-            # self.a_grads = the gradients of the policy to get more Q
+            # a_grads = the gradients of the policy to get more Q
            # tf.gradients will calculate dys/dxs with a initial gradients for ys, so this is dq/da * da/dparams
            self.policy_grads = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

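Because grad_ys seeds the backward pass through the actor with dQ/da, self.policy_grads ends up holding the chained gradient dQ/da * da/dparams, i.e. the deterministic policy gradient. As a rough illustration of how such gradients are then applied to the actor parameters, continuing inside add_grad_to_graph (a sketch assuming a learning rate LR_A; the optimizer wiring is illustrative, not necessarily this file's exact code):

        with tf.variable_scope('A_train'):
            # A negative learning rate turns the optimizer's default minimization
            # into gradient ascent on Q for the actor parameters.
            opt = tf.train.AdamOptimizer(-LR_A)
            self.train_op = opt.apply_gradients(zip(self.policy_grads, self.e_params))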
31 changes: 15 additions & 16 deletions contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update2.py
@@ -1,4 +1,8 @@
"""
+Note: This is an updated version of my previous code.
+For the target network, I use a moving average to softly replace the target parameters
+instead of using the assign function.
+By doing this, it runs about 20% faster on my machine (CPU).
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is Actor Critic based algorithm.
Pendulum example.
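For comparison, the assign-based soft replacement that this note refers to is typically built as explicit update ops that must be run on every learning step, roughly like the sketch below (illustrative variable names such as ae_params/at_params for the Actor's eval/target parameters, and likewise for the Critic; this is not the repository's exact previous code):

# Soft replacement with explicit assign ops: each target parameter t is
# nudged toward its eval counterpart e by a factor TAU.
soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                for t, e in zip(at_params + ct_params, ae_params + ce_params)]

# ... later, once per call to learn():
self.sess.run(soft_replace)

Folding the same update into an ExponentialMovingAverage op that fires through control_dependencies avoids this extra sess.run per step, which is presumably where the reported speed-up comes from.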
@@ -45,29 +49,25 @@ def __init__(self, a_dim, s_dim, a_bound,):
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

-        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)
+        self.a = self._build_a(self.S,)
+        q = self._build_c(self.S, self.a, )
+        a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor')
+        c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Critic')
+        ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)         # soft replacement

        def ema_getter(getter, name, *args, **kwargs):
            return ema.average(getter(name, *args, **kwargs))

-        self.a = self._build_a(self.S,)
-        a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor')
-
-        # assign self.a = a in memory when calculating q for td_error,
-        # otherwise the self.a is from Actor when updating Actor
-        q = self._build_c(self.S, self.a,)
-        c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')
-
-        target_update = [ema.apply(a_params), ema.apply(c_params)]
-        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)
+        target_update = [ema.apply(a_params), ema.apply(c_params)]     # soft update operation
+        a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)   # replaced target parameters
        q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)

-        with tf.control_dependencies(target_update):
+        a_loss = - tf.reduce_mean(q)  # maximize the q
+        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
+
+        with tf.control_dependencies(target_update):    # soft replacement happened at here
            q_target = self.R + GAMMA * q_
-            # in the feed_dict for the td_error, the self.a should change to actions in memory
            td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
-            a_loss = - tf.reduce_mean(q)  # maximize the q
-            self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
            self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

        self.sess.run(tf.global_variables_initializer())
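Read outside the diff, the new construction works like this: build the eval networks, let tf.train.ExponentialMovingAverage keep shadow copies of their parameters, rebuild the networks with reuse=True and a custom_getter that returns those shadow variables (this is the target network), and attach ema.apply to a training op via control_dependencies so the soft update runs as part of each learning step. Below is a minimal self-contained sketch of the mechanism, with a toy one-layer network and illustrative names rather than this repository's code:

import tensorflow as tf

TAU = 0.01

s = tf.placeholder(tf.float32, [None, 3], 's')
s_ = tf.placeholder(tf.float32, [None, 3], 's_')
r = tf.placeholder(tf.float32, [None, 1], 'r')

def build_net(x, reuse=None, custom_getter=None):
    trainable = reuse is None
    with tf.variable_scope('net', reuse=reuse, custom_getter=custom_getter):
        return tf.layers.dense(x, 1, name='v', trainable=trainable)

v = build_net(s)                                            # eval network
params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net')

ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)      # shadow <- decay * shadow + (1 - decay) * var
target_update = [ema.apply(params)]                         # op that refreshes the shadow copies

def ema_getter(getter, name, *args, **kwargs):
    return ema.average(getter(name, *args, **kwargs))       # hand back the shadow variable instead

v_ = build_net(s_, reuse=True, custom_getter=ema_getter)    # target network reads the EMA parameters

loss = tf.losses.mean_squared_error(labels=r + 0.9 * v_, predictions=v)
with tf.control_dependencies(target_update):                # soft update piggybacks on the train op
    train_op = tf.train.AdamOptimizer(0.001).minimize(loss, var_list=params)

Setting decay to 1 - TAU means each ema.apply step moves the shadow parameters toward the eval parameters by a factor of TAU, which matches the usual DDPG soft-update rule.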
@@ -112,7 +112,6 @@ def _build_c(self, s, a, reuse=None, custom_getter=None):

############################### training ####################################

-
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
