
Commit

add algos
chenchenglong committed Oct 26, 2017
1 parent 6f52a0e commit 1221697
Showing 16 changed files with 1,173 additions and 0 deletions.
191 changes: 191 additions & 0 deletions A2C.py
@@ -0,0 +1,191 @@

import torch as th
from torch import nn
from torch.optim import Adam, RMSprop
import numpy as np

from common.Memory import ReplayMemory
from common.Model import ActorNetwork, CriticNetwork
from common.utils import entropy, index_to_one_hot, to_tensor_var


class A2C(object):
"""
    An agent trained with Advantage Actor-Critic (A2C)
    - Actor takes the state as input
    - Critic takes both state and action as input
    - The agent interacts with the environment to collect experience
    - The agent trains on the collected experience to update its policy
"""
def __init__(self, env, memory_capacity, state_dim, action_dim,
actor_hidden_size=32, actor_lr=0.001,
critic_hidden_size=32, critic_lr=0.001,
max_grad_norm=None, entropy_reg=0.01,
optimizer_type="rmsprop", alpha=0.99, epsilon=1e-08,
use_cuda=False, batch_size=10, n_steps=5,
reward_gamma=0.99, done_penalty=None,
epsilon_start=0.9, epsilon_end=0.05,
epsilon_decay=200, episodes_before_train=100):

self.memory = ReplayMemory(memory_capacity)

self.env = env
self.state_dim = state_dim
self.action_dim = action_dim
self.n_steps = n_steps
self.env_state = self.env.reset()
self.n_episodes = 0
self.done_penalty = done_penalty
self.reward_gamma = reward_gamma
self.episodes_before_train = episodes_before_train

self.max_grad_norm = max_grad_norm
self.entropy_reg = entropy_reg
self.batch_size = batch_size

# params for epsilon greedy
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay

self.actor = ActorNetwork(self.state_dim, actor_hidden_size, self.action_dim, nn.functional.softmax)
self.critic = CriticNetwork(self.state_dim, self.action_dim, critic_hidden_size, 1)
if optimizer_type == "adam":
self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr)
self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_lr)
elif optimizer_type == "rmsprop":
self.actor_optimizer = RMSprop(
self.actor.parameters(), lr=actor_lr, alpha=alpha, eps=epsilon)
self.critic_optimizer = RMSprop(
self.critic.parameters(), lr=critic_lr, alpha=alpha, eps=epsilon)
self.use_cuda = use_cuda and th.cuda.is_available()
if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()

# discount roll out rewards
def _discount_reward(self, rewards, final_r):
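        # discounted_r[t] = rewards[t] + gamma * rewards[t+1] + ... + gamma^(T-1-t) * rewards[T-1] + gamma^(T-t) * final_r,
        # where T = len(rewards) and final_r bootstraps the value beyond the rollout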
discounted_r = np.zeros_like(rewards)
running_add = final_r
for t in reversed(range(0, len(rewards))):
running_add = running_add * self.reward_gamma + rewards[t]
discounted_r[t] = running_add
return discounted_r

    # the agent interacts with the environment to collect experience
def interact(self):
states = []
actions = []
rewards = []
# take n steps
for i in range(self.n_steps):
states.append(self.env_state)
action = self.exploration_action(self.env_state)
next_state, reward, done, _ = self.env.step(action)
actions.append(index_to_one_hot(action, self.action_dim))
            if done and self.done_penalty is not None:
                reward = self.done_penalty
rewards.append(reward)
final_state = next_state
self.env_state = next_state
if done:
self.env_state = self.env.reset()
break
# discount reward
if done:
final_r = 0.0
self.n_episodes += 1
self.episode_done = True
else:
self.episode_done = False
final_action = self.action(final_state)
final_r = self.value(final_state, index_to_one_hot(final_action, self.action_dim))
rewards = self._discount_reward(rewards, final_r)

self.memory.push(states, actions, rewards)

# train on a roll out batch
def train(self):
        # do not train until enough episodes of exploration have been collected
        if self.n_episodes <= self.episodes_before_train:
            return

batch = self.memory.sample(self.batch_size)
states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
actions_var = to_tensor_var(batch.actions, self.use_cuda).view(-1, self.action_dim)
rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

# update actor network
self.actor_optimizer.zero_grad()
        # actions_var holds the one-hot actions actually taken under exploration noise,
        # while softmax_actions are the current policy's action probabilities
softmax_actions = self.actor(states_var)
values = self.critic(states_var, actions_var).detach()
advantages = rewards_var - values
        # negative log-likelihood of the taken actions under the current policy
        neg_logloss = - th.sum(th.log(softmax_actions) * actions_var, 1)
pg_loss = th.mean(neg_logloss * advantages)
entropy_loss = th.mean(entropy(softmax_actions))
actor_loss = pg_loss + entropy_loss * self.entropy_reg
actor_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()

# update critic network
self.critic_optimizer.zero_grad()
target_values = rewards_var
values = self.critic(states_var, actions_var)
critic_loss = nn.MSELoss()(values, target_values)
critic_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm)
self.critic_optimizer.step()

# predict softmax action based on state
def _softmax_action(self, state):
state_var = to_tensor_var([state], self.use_cuda)
softmax_action_var = self.actor(state_var)
if self.use_cuda:
softmax_action = softmax_action_var.data.cpu().numpy()[0]
else:
softmax_action = softmax_action_var.data.numpy()[0]
return softmax_action

    # predict an action from the state, with extra randomness for exploration during training
def exploration_action(self, state):
softmax_action = self._softmax_action(state)
epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
np.exp(-1. * self.n_steps / self.epsilon_decay)
if np.random.rand() < epsilon:
action = np.random.choice(self.action_dim, p=softmax_action)
else:
action = np.argmax(softmax_action)
return action

# predict action based on state for execution
def action(self, state):
softmax_action = self._softmax_action(state)
action = np.argmax(softmax_action)
return action

# evaluate value
def value(self, state, action):
state_var = to_tensor_var([state], self.use_cuda)
action_var = to_tensor_var([action], self.use_cuda)
value_var = self.critic(state_var, action_var)
if self.use_cuda:
value = value_var.data.cpu().numpy()[0]
else:
value = value_var.data.numpy()[0]
return value

# evaluation
def evaluation(self, env, eval_episodes=10):
rewards = 0
for i in range(eval_episodes):
            state = env.reset()
            done = False
            while not done:
                action = self.action(state)
                state, reward, done, _ = env.step(action)
                rewards += reward
rewards /= float(eval_episodes)
return rewards
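
For context, here is a minimal, hypothetical driver-loop sketch for the A2C agent above (not part of this commit). It assumes a discrete-action gym environment such as CartPole-v0 and the common.* helpers imported at the top of A2C.py; the constants chosen here are purely illustrative.

# Hypothetical usage sketch for A2C.py (not part of this commit)
import gym

from A2C import A2C

MAX_EPISODES = 2000
EVAL_EVERY = 100

env = gym.make("CartPole-v0")
a2c = A2C(env=env, memory_capacity=10000,
          state_dim=env.observation_space.shape[0],
          action_dim=env.action_space.n,
          batch_size=10, episodes_before_train=100)

while a2c.n_episodes < MAX_EPISODES:
    a2c.interact()                 # roll out n_steps transitions and push them to memory
    if a2c.n_episodes > a2c.episodes_before_train:
        a2c.train()                # one actor/critic update on a sampled rollout batch
    if a2c.episode_done and a2c.n_episodes % EVAL_EVERY == 0:
        print("episode %d, average eval reward %.2f"
              % (a2c.n_episodes, a2c.evaluation(gym.make("CartPole-v0"))))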
188 changes: 188 additions & 0 deletions DDPG.py
@@ -0,0 +1,188 @@

import torch as th
import torch.nn as nn
from torch.optim import Adam, RMSprop
from copy import deepcopy
import numpy as np

from common.Memory import ReplayMemory
from common.Model import ActorNetwork, CriticNetwork
from common.utils import to_tensor_var


class DDPG(object):
"""
    An agent trained with Deep Deterministic Policy Gradient (DDPG) in the actor-critic framework
    - Actor takes the state as input
    - Critic takes both state and action as input
    - Critic is trained with temporal-difference targets computed from the target networks
"""
def __init__(self, env, memory_capacity, state_dim, action_dim,
actor_hidden_size=32, actor_lr=0.001,
actor_output_act=nn.functional.tanh,
critic_hidden_size=32, critic_lr=0.001,
max_grad_norm=None, max_steps=1000,
optimizer_type="rmsprop", alpha=0.99, epsilon=1e-08,
use_cuda=True, batch_size=10,
reward_gamma=0.99,
done_penalty=None, episodes_before_train=100,
target_tau=0.01, reward_scale=1.0,
critic_loss="huber", epsilon_start=0.99, epsilon_end=0.05,
epsilon_decay=200):

self.memory = ReplayMemory(memory_capacity)

self.env = env
self.state_dim = state_dim
self.action_dim = action_dim
self.env_state = self.env.reset()
self.n_episodes = 0
self.n_steps = 0
self.max_steps = max_steps
self.done_penalty = done_penalty
self.reward_gamma = reward_gamma
self.target_tau = target_tau
self.reward_scale = reward_scale

self.max_grad_norm = max_grad_norm
self.batch_size = batch_size
self.episodes_before_train = episodes_before_train
self.critic_loss = critic_loss

# params for epsilon greedy
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay

self.actor = ActorNetwork(self.state_dim, actor_hidden_size, self.action_dim, actor_output_act)
self.critic = CriticNetwork(self.state_dim, self.action_dim, critic_hidden_size, 1)
        # ensure the target networks start with the same weights as the learning networks
self.actor_target = deepcopy(self.actor)
self.critic_target = deepcopy(self.critic)

if optimizer_type == "adam":
self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr)
self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_lr)
elif optimizer_type == "rmsprop":
self.actor_optimizer = RMSprop(
self.actor.parameters(), lr=actor_lr, alpha=alpha, eps=epsilon)
self.critic_optimizer = RMSprop(
self.critic.parameters(), lr=critic_lr, alpha=alpha, eps=epsilon)

self.use_cuda = use_cuda and th.cuda.is_available()
if self.use_cuda:
self.actor.cuda()
self.critic.cuda()
self.actor_target.cuda()
self.critic_target.cuda()

    # the agent interacts with the environment to collect experience
def interact(self):
if self.n_steps >= self.max_steps:
self.env_state = self.env.reset()
self.n_steps = 0
state = self.env_state
# take one step action and get one step reward
action = self.exploration_action(self.env_state)
next_state, reward, done, _ = self.env.step(action)
if done:
if self.done_penalty is not None:
reward = self.done_penalty
next_state = [0]*len(state)
self.env_state = self.env.reset()
self.n_episodes += 1
self.episode_done = True
else:
self.env_state = next_state
self.episode_done = False
self.n_steps += 1
self.memory.push(state, action, reward, next_state, done)

# soft update the actor target network or critic target network
def _soft_update_target(self, target, source):
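        # Polyak averaging: target_param <- (1 - tau) * target_param + tau * source_param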
for t, s in zip(target.parameters(), source.parameters()):
t.data.copy_(
(1. - self.target_tau) * t.data + self.target_tau * s.data)

# train on a sample batch
def train(self):
        # do not train until enough episodes of exploration have been collected
        if self.n_episodes <= self.episodes_before_train:
            return

batch = self.memory.sample(self.batch_size)
state_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
action_var = to_tensor_var(batch.actions, self.use_cuda).view(-1, self.action_dim)
reward_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
next_state_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
done_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

# estimate the target q with actor_target network and critic_target network
next_action_var = self.actor_target(next_state_var)
next_q = self.critic_target(next_state_var, next_action_var).detach()
target_q = self.reward_scale * reward_var + self.reward_gamma * next_q * (1. - done_var)

# update critic network
self.critic_optimizer.zero_grad()
# current Q values
current_q = self.critic(state_var, action_var)
        # regress current Q values towards the bootstrapped target Q values
if self.critic_loss == "huber":
critic_loss = nn.functional.smooth_l1_loss(current_q, target_q)
else:
critic_loss = nn.MSELoss()(current_q, target_q)
critic_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm(self.critic.parameters(), self.max_grad_norm)
self.critic_optimizer.step()

# update actor network
self.actor_optimizer.zero_grad()
        # action predicted by the current deterministic policy
action = self.actor(state_var)
# actor_loss is used to maximize the Q value for the predicted action
actor_loss = - self.critic(state_var, action)
actor_loss = actor_loss.mean()
actor_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()

# update actor target network and critic target network
if self.n_steps % 100 == 0 and self.n_steps > 0:
self._soft_update_target(self.critic_target, self.critic)
self._soft_update_target(self.actor_target, self.actor)

    # predict an action from the state, with random noise added for exploration during training
def exploration_action(self, state):
action = self.action(state)
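        # epsilon decays exponentially from epsilon_start towards epsilon_end as n_steps grows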
epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
np.exp(-1. * self.n_steps / self.epsilon_decay)
# add noise
noise = np.random.randn(self.action_dim) * epsilon
action += noise
return action

# predict action based on state for execution (using current actor)
def action(self, state):
action_var = self.actor(to_tensor_var([state], self.use_cuda))
if self.use_cuda:
action = action_var.data.cpu().numpy()[0]
else:
action = action_var.data.numpy()[0]
return action

# evaluation
def evaluation(self, env, eval_episodes=10):
rewards = 0
for i in range(eval_episodes):
            state = env.reset()
            done = False
            while not done:
                action = self.action(state)
                state, reward, done, _ = env.step(action)
                rewards += reward
rewards /= float(eval_episodes)
return rewards
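
Similarly, a minimal, hypothetical driver-loop sketch for the DDPG agent above (not part of this commit). It assumes a continuous-action gym environment whose action scale roughly matches the actor's tanh output, plus the common.* helpers imported at the top of DDPG.py; the constants are illustrative.

# Hypothetical usage sketch for DDPG.py (not part of this commit)
import gym

from DDPG import DDPG

MAX_EPISODES = 2000

env = gym.make("Pendulum-v0")
ddpg = DDPG(env=env, memory_capacity=100000,
            state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            batch_size=64, episodes_before_train=100,
            use_cuda=False)

while ddpg.n_episodes < MAX_EPISODES:
    ddpg.interact()                # one environment step pushed to the replay memory
    if ddpg.n_episodes > ddpg.episodes_before_train:
        ddpg.train()               # one critic/actor update, plus a periodic soft target update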
