
improve performance
MorvanZhou committed Jan 27, 2018
1 parent ed018b4 commit 04d9f09
Showing 3 changed files with 166 additions and 5 deletions.
9 changes: 7 additions & 2 deletions contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG.py
@@ -13,6 +13,8 @@
import tensorflow as tf
import numpy as np
import gym
import time


np.random.seed(1)
tf.set_random_seed(1)
@@ -27,7 +29,7 @@
REPLACEMENT = [
dict(name='soft', tau=0.01),
dict(name='hard', rep_iter_a=600, rep_iter_c=500)
][1] # you can try different target replacement strategies
][0] # you can try different target replacement strategies
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

@@ -225,6 +227,7 @@ def sample(self, n):

var = 3 # control exploration

t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
@@ -259,4 +262,6 @@ def sample(self, n):
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
if ep_reward > -300:
RENDER = True
break
break

print('Running time: ', time.time()-t1)
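
The functional change in DDPG.py is the switch of the target-replacement strategy from hard to soft (index [1] to [0]); the time import and the closing print only measure wall-clock runtime. As a minimal sketch of what the two strategies mean, assuming TensorFlow 1.x and a hypothetical one-variable eval/target pair (not code from this repository):

import tensorflow as tf

tau = 0.01
e_params = [tf.Variable(1.0, name='eval_w')]    # hypothetical online-network variables
t_params = [tf.Variable(0.0, name='target_w')]  # hypothetical target-network variables

# soft replacement: blend a small fraction of the eval weights into the target on every learning step
soft_update = [tf.assign(t, (1 - tau) * t + tau * e) for t, e in zip(t_params, e_params)]

# hard replacement: copy the eval weights into the target outright, every few hundred steps (rep_iter_a / rep_iter_c above)
hard_update = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(soft_update)   # target_w -> 0.99*0.0 + 0.01*1.0 = 0.01
    sess.run(hard_update)   # target_w -> 1.0

Soft replacement keeps the target network close to the online one without the sudden jumps of a periodic hard copy.
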
@@ -13,6 +13,8 @@
import tensorflow as tf
import numpy as np
import gym
import time


##################### hyper parameters ####################

@@ -35,7 +37,6 @@ def __init__(self, a_dim, s_dim, a_bound,):
self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
self.pointer = 0
self.sess = tf.Session()
self.a_replace_counter, self.c_replace_counter = 0, 0

self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
@@ -122,6 +123,7 @@ def _build_c(self, s, a, scope, trainable):
ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
@@ -144,5 +146,6 @@ def _build_c(self, s, a, scope, trainable):
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
if ep_reward > -300:RENDER = True
break
# if ep_reward > -300:RENDER = True
break
print('Running time: ', time.time() - t1)
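
Both training scripts explore by adding zero-mean Gaussian noise with standard deviation var to the deterministic action, clipping the result to Pendulum's action range [-2, 2], and shrinking var by a factor of 0.9995 on each learning step once the replay memory has filled (see the loop in DDPG_update2.py below). A quick back-of-the-envelope check of that schedule, starting from var = 3:

# exploration std after n learning steps
for n in (1000, 10000, 30000):
    print(n, 3 * 0.9995 ** n)
# roughly 1.82 after 1k steps, 0.02 after 10k, 9e-7 after 30k

So well before the 200 * 200 = 40000 environment steps are up, the noise has effectively vanished and the policy acts deterministically.
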
153 changes: 153 additions & 0 deletions contents/9_Deep_Deterministic_Policy_Gradient_DDPG/DDPG_update2.py
@@ -0,0 +1,153 @@
"""
Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
DDPG is an Actor-Critic based algorithm.
Pendulum example.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
tensorflow 1.0
gym 0.8.0
"""

import tensorflow as tf
import numpy as np
import gym
import time


##################### hyper parameters ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001 # learning rate for actor
LR_C = 0.002 # learning rate for critic
GAMMA = 0.9 # reward discount
TAU = 0.01 # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'


############################### DDPG ####################################


class DDPG(object):
def __init__(self, a_dim, s_dim, a_bound,):
self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
self.pointer = 0
self.sess = tf.Session()

self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
self.R = tf.placeholder(tf.float32, [None, 1], 'r')

ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)

def ema_getter(getter, name, *args, **kwargs):
return ema.average(getter(name, *args, **kwargs))

self.a = self._build_a(self.S,)
a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor')

# when computing q for the td_error, self.a is fed with the actions stored in memory;
# when updating the Actor, self.a comes from the Actor network itself
q = self._build_c(self.S, self.a,)
c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')

target_update = [ema.apply(a_params), ema.apply(c_params)]
a_ = self._build_a(self.S_, reuse=True, custom_getter=ema_getter)
q_ = self._build_c(self.S_, a_, reuse=True, custom_getter=ema_getter)

with tf.control_dependencies(target_update):
q_target = self.R + GAMMA * q_
# when training the critic on the td_error, self.a is replaced via feed_dict by the actions sampled from memory
td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
a_loss = - tf.reduce_mean(q) # maximize the q
self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=a_params)
self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=c_params)

self.sess.run(tf.global_variables_initializer())

def choose_action(self, s):
return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

def learn(self):
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
bt = self.memory[indices, :]
bs = bt[:, :self.s_dim]
ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
br = bt[:, -self.s_dim - 1: -self.s_dim]
bs_ = bt[:, -self.s_dim:]

self.sess.run(self.atrain, {self.S: bs})
self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

def store_transition(self, s, a, r, s_):
transition = np.hstack((s, a, [r], s_))
index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory
self.memory[index, :] = transition
self.pointer += 1

def _build_a(self, s, reuse=None, custom_getter=None):
trainable = True if reuse is None else False
with tf.variable_scope('Actor', reuse=reuse, custom_getter=custom_getter):
net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name='scaled_a')

def _build_c(self, s, a, reuse=None, custom_getter=None):
trainable = True if reuse is None else False
with tf.variable_scope('Critic', reuse=reuse, custom_getter=custom_getter):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
return tf.layers.dense(net, 1, trainable=trainable) # Q(s,a)


############################### training ####################################


env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3 # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
s = env.reset()
ep_reward = 0
for j in range(MAX_EP_STEPS):
if RENDER:
env.render()

# Add exploration noise
a = ddpg.choose_action(s)
a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration
s_, r, done, info = env.step(a)

ddpg.store_transition(s, a, r / 10, s_)

if ddpg.pointer > MEMORY_CAPACITY:
var *= .9995 # decay the action randomness
ddpg.learn()

s = s_
ep_reward += r
if j == MAX_EP_STEPS-1:
print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
# if ep_reward > -300:RENDER = True
break

print('Running time: ', time.time() - t1)
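
DDPG_update2.py builds its target networks without explicit replacement ops: ema.apply(...) maintains an exponential moving average of every actor and critic variable, and the ema_getter passed as custom_getter makes the target graphs read those averages instead of the raw weights; wrapping the train ops in tf.control_dependencies(target_update) then refreshes the averages on every atrain/ctrain call. With decay = 1 - TAU, one EMA step is exactly the soft replacement above. A tiny sketch of that equivalence, assuming TensorFlow 1.x and a single hypothetical weight (not code from the file):

import tensorflow as tf

TAU = 0.01
w = tf.Variable(1.0, name='w')                         # the "online" weight
ema = tf.train.ExponentialMovingAverage(decay=1 - TAU)
update = ema.apply([w])                                # creates and refreshes the shadow (target) copy
w_target = ema.average(w)                              # what a target network reads via ema_getter

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.assign(w, 2.0))                        # pretend a training step moved the online weight
    sess.run(update)                                   # shadow <- (1 - TAU)*shadow + TAU*w
    print(sess.run(w_target))                          # 1.01, since the shadow starts at w's initial value 1.0
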

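Each transition is stored as a single flat row, np.hstack((s, a, [r], s_)), and learn() recovers the pieces by slicing that row out of the sampled batch. A small worked example of the layout with Pendulum-sized dimensions (the numbers are made up):

import numpy as np

s_dim, a_dim = 3, 1                  # Pendulum-v0: 3-dim state, 1-dim action
s  = np.array([0.1, 0.2, 0.3])       # state
a  = np.array([0.5])                 # action
r  = -1.0                            # reward
s_ = np.array([0.4, 0.5, 0.6])       # next state

row = np.hstack((s, a, [r], s_))     # length s_dim*2 + a_dim + 1 == 8
print(row[:s_dim])                   # state      -> [0.1 0.2 0.3]
print(row[s_dim: s_dim + a_dim])     # action     -> [0.5]
print(row[-s_dim - 1: -s_dim])       # reward     -> [-1.]
print(row[-s_dim:])                  # next state -> [0.4 0.5 0.6]

The reward slice is written as row[-s_dim - 1: -s_dim] so it depends only on the state dimension, not on the action dimension.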