
Add notations
xlnwel committed Oct 22, 2019
1 parent 94c6246 commit 6b07e01
Showing 16 changed files with 93 additions and 58 deletions.
5 changes: 2 additions & 3 deletions algo/ddqn/agent.py
@@ -33,7 +33,6 @@ def __init__(self,
# hyperparameters
self.gamma = args['gamma'] if 'gamma' in args else .99
self.update_freq = args['update_freq']
self.loss_type = args['loss_type']
self.target_update_freq = args['target_update_freq']
self.update_step = 0

@@ -84,7 +83,7 @@ def __init__(self,
device=device)

# learning rate schedule
decay_duration = float(self.args['max_steps']) / 10 # 2e7
decay_duration = float(self.args['max_steps'])
lr = float(self.args['Qnets']['learning_rate'])
end_lr = float(self.args['Qnets']['end_lr'])
self.lr_schedule = PiecewiseSchedule([(0, lr), (decay_duration / 8, lr), (decay_duration / 4, end_lr)],
@@ -297,7 +296,7 @@ def _loss(self):
self.Qnets.Q_next_target, self.gamma, self.data['steps'])

with tf.name_scope('loss'):
loss_func = huber_loss if self.loss_type == 'huber' else tf.square
loss_func = huber_loss if self.args['loss_type'] == 'huber' else tf.square
if self.buffer_type == 'proportional':
loss = tf.reduce_mean(self.data['IS_ratio'][:, None] * loss_func(Q_error), name='loss')
else:
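For reference, a minimal NumPy sketch of the loss selection in the `_loss` hunk above: a Huber or squared loss on the TD error, weighted by importance-sampling ratios when the proportional prioritized-replay buffer is used. Names and signatures here are illustrative, not the repo's `huber_loss`.

```python
import numpy as np

def huber_loss(x, delta=1.0):
    # Quadratic inside [-delta, delta], linear outside; less sensitive to outliers than x**2.
    abs_x = np.abs(x)
    return np.where(abs_x <= delta, 0.5 * x ** 2, delta * (abs_x - 0.5 * delta))

def td_loss(q_error, loss_type='huber', is_ratio=None):
    # q_error: [B] TD errors; is_ratio: [B] importance-sampling weights from proportional PER.
    loss_fn = huber_loss if loss_type == 'huber' else np.square
    per_sample = loss_fn(q_error)
    if is_ratio is not None:
        per_sample = is_ratio * per_sample
    return per_sample.mean()
```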
1 change: 1 addition & 0 deletions algo/ddqn/args.yaml
@@ -39,6 +39,7 @@ agent:
learning_rate: 1.e-4
end_lr: 5.e-5
clip_norm: 10
epsilon: 0.0003125
env_stats:
times: 1
stats: [score, score_mean, score_std, score_best, epslen_mean, epslen_std]
2 changes: 1 addition & 1 deletion algo/ddqn/base_net.py
@@ -54,7 +54,7 @@ def _head_net(self, x, out_dim, name=None):
def net(x, out_dim):
layer = self.noisy if self.use_noisy else self.dense
name_fn = lambda i: f'noisy_{i}' if self.use_noisy else f'dense_{i}'
x = layer(x, 512, name=name_fn(1))
x = layer(x, 256, name=name_fn(1))
x = norm_activation(x, norm=self.dense_norm, activation=tf.nn.relu)
x = layer(x, out_dim, name=name_fn(2))
return x
12 changes: 7 additions & 5 deletions algo/rainbow/agent.py
@@ -58,15 +58,18 @@ def _loss(self):

Tz = n_step_target(self.data['reward'], self.data['done'],
z_support[None, :], self.gamma, self.data['steps']) # [B, N]
# compute projection
Tz = tf.clip_by_value(Tz, v_min, v_max)[:, None, :] # [B, 1, N]
z_original = z_support[None, :, None] # [1, N, 1]
z_support = z_support[None, :, None] # [1, N, 1]

weight = tf.clip_by_value(1. - tf.abs(Tz - z_original) / delta_z, 0, 1) # [B, N, N]
dist_target = tf.reduce_sum(weight * self.Qnets.dist_next_target, axis=2) # [B, N]
# quotient = `[1 - |Tz - z| / Δz]_0^1` in Eq. 7 of the C51 paper
quotient = tf.clip_by_value(1. - tf.abs(Tz - z_support) / delta_z, 0, 1) # [B, N, N]
dist_next = tf.expand_dims(self.Qnets.dist_next_target, 1) # [B, 1, N]
dist_target = tf.reduce_sum(quotient * dist_next, axis=2) # [B, N]
dist_target = tf.stop_gradient(dist_target)

kl_loss = tf.nn.softmax_cross_entropy_with_logits(labels=dist_target, logits=self.Qnets.logits)
loss = tf.reduce_mean(kl_loss, name='loss')
loss = tf.reduce_mean(self.data['IS_ratio'] * kl_loss, name='loss')

with tf.name_scope('priority'):
priority = self._rescale(kl_loss)
@@ -80,5 +83,4 @@ def _log_info(self):
if self.buffer_type == 'proportional':
stats_summary('priority', self.priority, max=True, std=True)
stats_summary('Q', self.Qnets.Q, max=True)
stats_summary('prob', self.Qnets.dist, max=True)
tf.summary.scalar('loss_', self.loss)
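For reference, a minimal NumPy sketch of the categorical (C51) projection that the `quotient` tensor above implements, assuming a one-step target and illustrative support bounds; the repo builds `Tz` with its n-step helper instead.

```python
import numpy as np

def c51_projection(reward, done, next_dist, gamma=0.99,
                   v_min=-10.0, v_max=10.0, n_atoms=51):
    """Project the Bellman-updated support onto the fixed support (Eq. 7 of the C51 paper).

    reward, done: [B]; next_dist: [B, N] target-network probabilities at the
    greedy next action. Returns the projected target distribution, [B, N].
    """
    z = np.linspace(v_min, v_max, n_atoms)                  # fixed support, [N]
    delta_z = (v_max - v_min) / (n_atoms - 1)
    Tz = reward[:, None] + gamma * (1. - done[:, None]) * z[None, :]          # [B, N]
    Tz = np.clip(Tz, v_min, v_max)[:, None, :]              # [B, 1, N]
    quotient = np.clip(1. - np.abs(Tz - z[None, :, None]) / delta_z, 0., 1.)  # [B, N, N]
    return (quotient * next_dist[:, None, :]).sum(axis=2)   # [B, N]
```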
1 change: 1 addition & 0 deletions algo/rainbow/args.yaml
@@ -41,6 +41,7 @@ agent:
learning_rate: 1.e-4
end_lr: 5.e-5
clip_norm: 10
epsilon: 0.0003125
env_stats:
times: 1
stats: [score, score_mean, score_std, score_best, epslen_mean, epslen_std]
5 changes: 2 additions & 3 deletions algo/rainbow/network.py
@@ -40,7 +40,7 @@ def _build_graph(self):
self.best_action = select_action(Qs, 'best_action')
next_action = select_action(Qs_next, 'next_action')

# [B, 1, N], [B, 1, N]
# [B, N], [B, N]
self.logits = self._c51_action_value(self.action, logits, 'logits')
self.dist_next_target = self._c51_action_value(next_action, dist_next_target, 'dist_next_target')
# for tensorboard bookkeeping
@@ -73,6 +73,5 @@ def _c51_action_value(self, action, values, name):
with tf.name_scope(name):
action = action[..., None]
value = tf.gather_nd(values, action, batch_dims=1)
value = tf.expand_dims(value, axis=1)

return value
return value
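For reference, the `tf.gather_nd(..., batch_dims=1)` call above selects each sample's row for the chosen action; a minimal NumPy equivalent of that per-action gather (shapes illustrative):

```python
import numpy as np

def select_action_values(values, action):
    """values: [B, A, N] per-action distributions/logits; action: [B] int indices.

    Returns the [B, N] slice for each sample's chosen action, with no extra
    singleton axis (the removed `expand_dims` above used to add one).
    """
    return values[np.arange(values.shape[0]), action]
```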
22 changes: 10 additions & 12 deletions algo/rainbow_iqn/agent.py
@@ -23,7 +23,6 @@ def __init__(self,
log_stats=False,
device=None,
reuse=None):

super().__init__(name, args,
env_args, buffer_args,
sess_config=sess_config,
@@ -52,18 +51,17 @@ def _loss(self):
def tiled_n_step_target():
n_quantiles = self.args['Qnets']['N_prime']

reward_tiled = tf.reshape(tf.tile(self.data['reward'], [n_quantiles, 1]),
[n_quantiles, -1, 1])
done_tiled = tf.reshape(tf.tile(self.data['done'], [n_quantiles, 1]),
[n_quantiles, -1, 1])
steps_tiled = tf.reshape(tf.tile(self.data['steps'], [n_quantiles, 1]),
[n_quantiles, -1, 1])
return n_step_target(reward_tiled, done_tiled,
reward = self.data['reward'][None, ...]
done = self.data['done'][None, ...]
steps = self.data['steps'][None, ...]
return n_step_target(reward, done,
self.Qnets.quantile_values_next_target,
self.gamma, steps_tiled)

self.gamma, steps)
def quantile_regression_loss(u):
# [B, N, N']
abs_part = tf.abs(self.Qnets.quantiles - tf.where(u < 0, tf.ones_like(u), tf.zeros_like(u)))

huber = huber_loss(u, delta=self.args['Qnets']['delta'])

qr_loss = tf.reduce_sum(tf.reduce_mean(abs_part * huber, axis=2), axis=1) # [B]
@@ -75,10 +73,10 @@ def quantile_regression_loss(u):
self.Qnets.Q_next_target, self.gamma, self.data['steps'])

with tf.name_scope('loss'):
quantile_values_target = tiled_n_step_target()
quantile_values_target = tiled_n_step_target() # [N', B, 1]
quantile_values_target = tf.transpose(quantile_values_target, [1, 2, 0]) # [B, 1, N']
quantile_values = tf.transpose(self.Qnets.quantile_values, [1, 0, 2]) # [B, N, 1]
quantile_error = tf.abs(quantile_values - quantile_values_target)
quantile_error = quantile_values_target - quantile_values # [B, N, N']

loss = quantile_regression_loss(quantile_error)

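For reference, a minimal NumPy sketch of the quantile-regression Huber loss assembled above, with target values of shape [B, 1, N'], predicted quantile values of shape [B, N, 1], and sampled quantile fractions τ of shape [B, N, 1]; names here are illustrative.

```python
import numpy as np

def huber(u, delta=1.0):
    abs_u = np.abs(u)
    return np.where(abs_u <= delta, 0.5 * u ** 2, delta * (abs_u - 0.5 * delta))

def quantile_regression_loss(quantile_values, target_values, taus, delta=1.0):
    """quantile_values: [B, N, 1], target_values: [B, 1, N'], taus: [B, N, 1] in (0, 1)."""
    u = target_values - quantile_values                   # TD errors, broadcast to [B, N, N']
    asym_weight = np.abs(taus - (u < 0).astype(u.dtype))  # |tau - 1{u < 0}|
    per_sample = (asym_weight * huber(u, delta)).mean(axis=2).sum(axis=1)  # [B]
    return per_sample.mean()
```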
2 changes: 1 addition & 1 deletion algo/rainbow_iqn/args.yaml
@@ -24,7 +24,6 @@ agent:
frame_history_len: 4
update_freq: 4
n_steps: 3
loss_type: huber # huber or mse
target_update_freq: 8000 # we count update step, which is 4 times frame steps

Qnets:
@@ -43,6 +42,7 @@ agent:
learning_rate: 1.e-4
end_lr: 5.e-5
clip_norm: 10
epsilon: 0.0003125
env_stats:
times: 1
stats: [score, score_mean, score_std, score_best, epslen_mean, epslen_std]
36 changes: 22 additions & 14 deletions algo/rainbow_iqn/network.py
@@ -57,27 +57,33 @@ def _build_graph(self):

# quantile_values for regression loss
# Q for priority required by PER
self.quantile_values, self.Q = self._iqn_values(self.action, self.N, quantile_values, Qs)
# [N, B, 1], [B, 1]
self.quantile_values, self.Q = self._iqn_values(self.action, self.N, quantile_values, Qs, 'quantile_q')
# [N', B, 1], [B, 1]
self.quantile_values_next_target, self.Q_next_target = self._iqn_values(next_action,
self.N_prime,
quantile_values_next_target,
Qs_next_target)
Qs_next_target,
'next_quantile_q')

def _iqn_net(self, x, n_quantiles, batch_size, out_dim,
psi_net, phi_net, f_net, name, reuse=None):
quantile_embedding_dim = self.args['quantile_embedding_dim']

with tf.variable_scope(name, reuse=reuse):
# 𝜓 function in the paper
# [N*B, H]
x_tiled = psi_net(x, n_quantiles)

h_dim = x_tiled.shape.as_list()[1]

# 𝜙 function in the paper
# [B, N, 1], [N*B, H]
quantiles, x_quantiles = phi_net(n_quantiles, batch_size, quantile_embedding_dim, h_dim)
# Combine outputs of psi and phi
y = x_tiled * x_quantiles
y = x_tiled * x_quantiles # [N*B, H]
# f function in the paper
# [N, B, O], [B, O]
v_qv, v = f_net(y, 1, n_quantiles, batch_size, name='value_net')
a_qv, a = f_net(y, out_dim, n_quantiles, batch_size, name='adv_net')

@@ -90,7 +96,7 @@ def _iqn_net(self, x, n_quantiles, batch_size, out_dim,
def _psi_net(self, x, n_quantiles):
with tf.variable_scope('psi_net'):
x = self._conv_net(x)
x_tiled = tf.tile(x, [n_quantiles, 1])
x_tiled = tf.tile(x, [n_quantiles, 1]) # [N*B, H]

return x_tiled

Expand All @@ -99,34 +105,36 @@ def _phi_net(self, n_quantiles, batch_size, quantile_embedding_dim, h_dim):
quantile_shape = [n_quantiles * batch_size, 1]
quantiles = tf.random.uniform(quantile_shape, minval=0, maxval=1) # [N*B, 1]
quantiles_tiled = tf.tile(quantiles, [1, quantile_embedding_dim]) # [N*B, D]
# returned quantiles for computing quantile regression loss
# returned quantiles for computing quantile regression loss, [B, N, 1]
quantiles_reformed = tf.transpose(tf.reshape(quantiles, [n_quantiles, batch_size, 1]), [1, 0, 2])

with tf.variable_scope('phi_net'):
pi = tf.constant(np.pi)
degrees = tf.cast(tf.range(quantile_embedding_dim), tf.float32) * pi * quantiles_tiled
x_quantiles = tf.cos(degrees)
x_quantiles = tf.layers.dense(x_quantiles, h_dim)
x_quantiles = tf.nn.relu(x_quantiles)
x_quantiles = tf.nn.relu(x_quantiles) # [N*B, H]

return quantiles_reformed, x_quantiles

def _f_net(self, x, out_dim, n_quantiles, batch_size, name=None):
name = f'{name}_f_net' if name else 'f_net'
with tf.variable_scope(name):
quantile_values = self._head_net(x, out_dim)
quantile_values = tf.reshape(quantile_values, (n_quantiles, batch_size, out_dim))
q = tf.reduce_mean(quantile_values, axis=0)
quantile_values = self._head_net(x, out_dim) # [N*B, O]
quantile_values = tf.reshape(quantile_values, (n_quantiles, batch_size, out_dim)) # [N, B, O]
q = tf.reduce_mean(quantile_values, axis=0) # [B, O]

return quantile_values, q

def _iqn_values(self, action, n_quantiles, quantile_values, Qs):
with tf.name_scope('action_values'):
action_tiled = tf.reshape(tf.tile(action, [n_quantiles]),
def _iqn_values(self, action, n_quantiles, quantile_values, Qs, name):
with tf.name_scope(name):
action_tiled = tf.reshape(tf.tile(action, [n_quantiles]),
[n_quantiles, -1])
quantile_values = tf.reduce_sum(tf.one_hot(action_tiled, self.n_actions)
* quantile_values, axis=2, keepdims=True)
* quantile_values, axis=2, keepdims=True,
name='quantile_values')
q = tf.reduce_sum(tf.one_hot(action, self.n_actions)
* Qs, axis=1, keepdims=True)
* Qs, axis=1, keepdims=True,
name='q')

return quantile_values, q
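For reference, a minimal NumPy sketch of the quantile embedding computed by `_phi_net` above: sample τ ~ U(0, 1), embed it with cos(πiτ) for i = 0, …, D−1, pass the embedding through a dense layer with ReLU, and later combine it with the tiled state features by an elementwise product. The dense weights below are random placeholders standing in for the learned layer.

```python
import numpy as np

def quantile_embedding(batch_size, n_quantiles, embedding_dim=64, h_dim=512, rng=np.random):
    taus = rng.uniform(size=(n_quantiles * batch_size, 1))        # [N*B, 1]
    i = np.arange(embedding_dim, dtype=np.float64)                # [D]
    cos_features = np.cos(np.pi * i[None, :] * taus)              # [N*B, D]
    # stand-in for the learned dense layer + ReLU in _phi_net
    w = rng.normal(scale=embedding_dim ** -0.5, size=(embedding_dim, h_dim))
    phi = np.maximum(cos_features @ w, 0.)                        # [N*B, H]
    # taus reshaped to [B, N, 1] for the quantile-regression loss
    taus_reformed = taus.reshape(n_quantiles, batch_size, 1).transpose(1, 0, 2)
    return taus_reformed, phi

# combined with tiled state features psi(x) of shape [N*B, H]: y = psi_tiled * phi
```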
22 changes: 14 additions & 8 deletions algo/single_train.py
@@ -38,8 +38,8 @@ def train(agent):
start_time = time.time()
while step <= max_steps:
train_duration, (scores, epslens, step, itr) = timeit(agent.train, step, itr)
pwc(f'Training Duration: {train_duration:2f}s\n', 'blue')
pwc(f'Average Score: {np.mean(scores):2f}\n', 'blue')
pwc(f'Training Duration: {train_duration:.2f}s', 'blue')
pwc(f'Average Score: {np.mean(scores):.2f}', 'blue')
pwc(f'Average Epslen: {np.mean(epslens):.2f}', 'blue')

eval_duration, (eval_scores, eval_epslens) = timeit(agent.eval)
@@ -53,10 +53,10 @@
epslen_mean = np.mean(eval_epslens)
epslen_std = np.std(eval_epslens)

if score_best_mean > score_mean:
if score_mean > score_best_mean:
score_best_mean = score_mean
agent.save()

agent.record_stats(global_step=step, score=score, score_mean=score_mean,
score_std=score_std, score_best=score_best,
epslen_mean=epslen_mean, epslen_std=epslen_std)
@@ -67,7 +67,6 @@
'Episode': itr,
'TimeElapsed': f'{time.time() - start_time:.2f}s',
'Score': score,
'TrainScoreMean': np.mean(agent.env.get_episode_rewards()[-20:]),
'ScoreMean': score_mean,
'ScoreStd': score_std,
'ScoreBest': score_best,
@@ -101,9 +100,16 @@ def main(env_args, agent_args, buffer_args, render=False, restore=False):

agent_args['env_stats']['times'] = 1
sess_config = get_sess_config(2)
agent = Agent('Agent', agent_args, env_args, buffer_args,
save=True, log=True, log_tensorboard=True,
log_stats=True, log_params=False)
agent = Agent('Agent',
agent_args,
env_args,
buffer_args,
sess_config=sess_config,
save=True,
log=True,
log_tensorboard=True,
log_stats=True,
log_params=False)

if restore:
agent.restore()
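The comparison fix above reverses an inverted condition: a checkpoint should be written when the new evaluation mean beats the best mean seen so far. A minimal sketch of that pattern (names illustrative):

```python
import numpy as np

def maybe_save(agent, eval_scores, score_best_mean):
    """Save only on improvement; returns the (possibly updated) best mean score."""
    score_mean = np.mean(eval_scores)
    if score_mean > score_best_mean:
        score_best_mean = score_mean
        agent.save()
    return score_best_mean
```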
8 changes: 4 additions & 4 deletions basic_model/layer.py
@@ -247,11 +247,11 @@ def fixup_residual(self, x, norm=None, name=None):

y = x
with tf.variable_scope(name):
y = tf.nn.relu(y)
y = tf_utils.norm_activation(y, norm=norm, activation=tf.nn.relu)
y = y + get_bias('bias1')
y = conv(y)
y = y + get_bias('bias2')
y = tf.nn.relu(y)
y = tf_utils.norm_activation(y, norm=norm, activation=tf.nn.relu)
y = y + get_bias('bias3')
y = conv(y)
y = y * get_scale() + get_bias('bias4')
@@ -266,8 +266,8 @@ def net_fn(x):
with tf.variable_scope(f'block_{i}_{filters}'):
x = self.conv_norm_activation(x, filters, 3, padding='same', norm=conv_norm)
x = tf.layers.max_pooling2d(x, 3, 2, padding='same')
x = residual(x, f'residual_1')
x = residual(x, f'residual_2')
x = residual(x, name='residual_1')
x = residual(x, name='residual_2')

x = tf.nn.relu(x)
x = tf.layers.flatten(x)
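The change above routes the pre-activations through `norm_activation`, so an optional normalization runs before the ReLU instead of a bare `tf.nn.relu`. A minimal sketch of the assumed composition (not the repo's exact helper):

```python
def norm_activation(x, norm=None, activation=None):
    """Apply an optional normalization function, then an optional activation."""
    if norm is not None:
        x = norm(x)
    if activation is not None:
        x = activation(x)
    return x
```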
4 changes: 2 additions & 2 deletions env/atari_wrappers.py
@@ -291,9 +291,9 @@ def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False,

def make_deepmind_atari(args):
env = make_atari(args['name'])
if 'log_video' in args and args['log_video']:
if args['log_video']:
# put monitor in middle to properly record episodic information
env = gym.wrappers.Monitor(env, args['video_path'])
env = gym.wrappers.Monitor(env, args['video_path'], force=True)
env = wrap_deepmind(env, args['episode_life'])

return env
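As the comment above notes, the Monitor wrapper is inserted before the DeepMind wrappers so it records raw episodic returns and full episodes, and `force=True` lets Gym overwrite an existing recording directory. A minimal sketch of that ordering with the old Gym Monitor API (wrapper details elided):

```python
import gym

def make_recorded_env(name, video_path, log_video=False):
    env = gym.make(name)
    if log_video:
        # Record before reward clipping / episode-life wrappers so the monitor
        # sees unclipped returns and whole episodes; force=True overwrites old logs.
        env = gym.wrappers.Monitor(env, video_path, force=True)
    # DeepMind-style preprocessing (frame skip, grayscale, reward clipping, ...) goes here.
    return env
```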
9 changes: 9 additions & 0 deletions readme.md
@@ -28,6 +28,15 @@ Episodic rewards averaged over 100 episodes **at training time**.
<figcaption></figcaption>
</figure>

Compared to [Google's Dopamine](https://github.com/google/dopamine), shown below, our implementation achieves better performance on Breakout.

<figure>
<img src="results/dopamine-BreakoutNoFrameskip.png" alt="" width="1000">
<figcaption></figcaption>
</figure>

Source: https://google.github.io/dopamine/baselines/plots.html; each iteration corresponds to 250,000 steps.

## Running

```shell
Binary file added results/dopamine-breakout.png
12 changes: 10 additions & 2 deletions run/grid_search.py
@@ -8,14 +8,15 @@


class GridSearch:
def __init__(self, args_file, train_func, render=False, n_trials=1, dir_prefix=''):
def __init__(self, args_file, train_func, render=False, n_trials=1, sub_process=False, dir_prefix=''):
args = load_args(args_file)
self.env_args = args['env']
self.agent_args = args['agent']
self.buffer_args = args['buffer'] if 'buffer' in args else {}
self.train_func = train_func
self.render = render
self.n_trials = n_trials
self.sub_process = sub_process
self.dir_prefix = dir_prefix

self.processes = []
@@ -24,7 +25,14 @@ def __call__(self, **kwargs):
self._dir_setup()
if kwargs == {} and self.n_trials == 1:
# if no argument is passed in, run the default setting
self.train_func(self.env_args, self.agent_args, self.buffer_args, self.render)
if self.sub_process:
p = Process(target=self.train_func,
args=(self.env_args, self.agent_args, self.buffer_args, self.render))
p.start()
time.sleep(1)
self.processes.append(p)
else:
self.train_func(self.env_args, self.agent_args, self.buffer_args, self.render)
else:
# do grid search
self.agent_args['model_name'] = 'GS'
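The new `sub_process` flag runs the training function in a child process instead of the caller's process; a minimal sketch of that pattern, assuming the imports the hunk relies on (`multiprocessing.Process`, `time`):

```python
import time
from multiprocessing import Process

def launch(train_func, env_args, agent_args, buffer_args, render=False, sub_process=False):
    if not sub_process:
        train_func(env_args, agent_args, buffer_args, render)
        return None
    p = Process(target=train_func, args=(env_args, agent_args, buffer_args, render))
    p.start()
    time.sleep(1)   # stagger start-up, as in the diff, so runs don't race on shared directories
    return p
```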