Commit

halfcheetah hyperparams
Panjete committed Nov 2, 2023
1 parent 6b607f5 commit a8ed9a7
Showing 10 changed files with 79 additions and 67 deletions.
Binary file modified __pycache__/config.cpython-39.pyc
Binary file modified agents/__pycache__/mujoco_agents.cpython-39.pyc
121 changes: 61 additions & 60 deletions agents/mujoco_agents.py
@@ -272,15 +272,16 @@ def __init__(self, observation_dim:int, action_dim:int, args = None, discrete:bo
self.reward_scale = self.hyperparameters["reward_scale"] ## Weighting given to the environment reward relative to the bootstrapped value in the critic target

# Initialising predictors and quality estimators
print("MAX TRAJ LEN =", self.hyperparameters["maxtraj"])
#print("MAX TRAJ LEN =", self.hyperparameters["maxtraj"])
self.actor = Actor(self.alpha, self.observation_dim, self.action_dim, "Actor")
self.critic1 = Critic(self.observation_dim, self.action_dim, self.beta, "Critic1")
self.critic2 = Critic(self.observation_dim, self.action_dim, self.beta, "Critic2")
self.value = Value(self.observation_dim, self.beta, "Value")
self.target_value = Value(self.observation_dim, self.beta, "target_value")

self.cur_max_reward = 0.0
self.update_network_parameters(tau = 1)


def forward(self, observation: torch.FloatTensor):
#*********YOUR CODE HERE******************
actions, _ = self.actor.sample_normal(observation, r = False)
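The Actor's sample_normal is called here and again inside learn() below, but its definition is not part of this diff. A minimal sketch of a tanh-squashed Gaussian sampler consistent with those call sites (returning actions plus log-probabilities, with r toggling the reparameterised sample); the mean/std head, the std_min clamp and the 1e-6 constant are assumptions, not taken from the repository:

import torch
import torch.distributions as D

def sample_normal(self, state, r=True):
    # sketched as an Actor method; self.forward and self.std_min are assumed attributes
    mu, sigma = self.forward(state)                # assumed: the Actor head outputs mean and std
    sigma = torch.clamp(sigma, min=self.std_min)   # std_min appears in the hyperparameters in config.py
    dist = D.Normal(mu, sigma)
    raw = dist.rsample() if r else dist.sample()   # r=True -> differentiable (reparameterised) sample
    action = torch.tanh(raw)                       # squash into [-1, 1]
    # tanh change-of-variables correction so log_prob matches the squashed action
    log_prob = dist.log_prob(raw) - torch.log(1.0 - action.pow(2) + 1e-6)
    return action, log_prob.sum(dim=-1, keepdim=True)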
@@ -302,58 +303,61 @@ def update_network_parameters(self, tau = None):
self.target_value.load_state_dict(vs_dict)

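Only the final line of update_network_parameters is visible in this hunk; presumably it performs the usual Polyak soft update of the target value network, roughly as sketched below (not the repository's exact code). Passing tau = 1, as __init__ does above, makes the target an exact copy.

def update_network_parameters(self, tau=None):
    if tau is None:
        tau = self.tau                                   # soft-update rate from the hyperparameters
    value_params = dict(self.value.named_parameters())
    target_params = dict(self.target_value.named_parameters())
    # blend each value-network parameter into the target network
    vs_dict = {name: tau * value_params[name].clone()
                     + (1 - tau) * target_params[name].clone()
               for name in value_params}
    self.target_value.load_state_dict(vs_dict)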
## For now this learns one trajectory at a time; maybe try a batched version? (a sketch follows learn() below)
def learn(self, traj):
def learn(self, trajs):
# ## May need to do np.append instead of list comprehension
traj = traj[0]
reward = torch.tensor(traj["reward"], dtype = torch.float).to(self.actor.device) ## Rewards earned in this trajectory
state_ = torch.tensor(traj["next_observation"], dtype = torch.float).to(self.actor.device) ## s' - the states the trajectory goes into after action a
state = torch.tensor(traj["observation"], dtype = torch.float).to(self.actor.device) ## s - the current states being analysed
action = torch.tensor(traj["action"], dtype = torch.float).to(self.actor.device) ## action a actually taken in the trajectory
done = torch.tensor(traj["terminal"], dtype = torch.float).to(self.actor.device) ## Terminal state or not?

#np.concatenate(numpy_arrays, axis=0)
#print("done = ", done)
value = self.value(state).view(-1) ## Collapse across batch dimension - anyways 1
value_ = self.target_value(state_).view(-1) ## Collapse across batch dimension - anyways 1
value_[-1] = 0.0

actions, log_probs = self.actor.sample_normal(state, r = False) ## Find actions and corres. log_prob suggested by actor network, Do not reparametrize
log_probs = log_probs#.view(-1)
q1_np = self.critic1.forward(state,actions) ## How good is the suggested state-action pair
q2_np = self.critic2.forward(state,actions) ## How good is the suggested state-action pair, second opinion
critic_value = torch.min(q1_np, q2_np) ## For removing over-estimation bias
critic_value = critic_value.view(-1)

self.value.optimizer.zero_grad()
value_target = critic_value - log_probs.view(-1) ## Soft value target: Q(s, pi(s)) - log pi(a|s)
value_loss = 0.5 * F.mse_loss(value, value_target) ## How far the value estimate is from this target
value_loss.backward(retain_graph=True)
self.value.optimizer.step()

actions, log_probs = self.actor.sample_normal(state, r = True)
log_probs = log_probs.view(-1)
q1_np = self.critic1.forward(state,actions)
q2_np = self.critic2.forward(state,actions)
critic_value = torch.min(q1_np, q2_np)
critic_value = critic_value.view(-1)

actor_loss = torch.mean(log_probs- critic_value)
self.actor.optimizer.zero_grad()
actor_loss.backward(retain_graph=True)
self.actor.optimizer.step()

self.critic1.optimizer.zero_grad()
self.critic2.optimizer.zero_grad()
q_hat = self.reward_scale * reward + self.gamma * value_
q1_op = self.critic1.forward(state,action).view(-1)
q2_op = self.critic2.forward(state,action).view(-1)
critic_loss = 0.5 * (F.mse_loss(q1_op, q_hat) + F.mse_loss(q2_op, q_hat))
critic_loss.backward()
self.critic1.optimizer.step()
self.critic2.optimizer.step()

self.update_network_parameters()

rewards_earned = 0.0
for traj in trajs:
reward = torch.tensor(traj["reward"], dtype = torch.float).to(self.actor.device) ## Rewards earned in this trajectory
state_ = torch.tensor(traj["next_observation"], dtype = torch.float).to(self.actor.device) ## s' - the states the trajectory goes into after action a
state = torch.tensor(traj["observation"], dtype = torch.float).to(self.actor.device) ## s - the current states being analysed
action = torch.tensor(traj["action"], dtype = torch.float).to(self.actor.device) ## action a actually taken in the trajectory
done = torch.tensor(traj["terminal"], dtype = torch.float).to(self.actor.device) ## Terminal state or not?

#np.concatenate(numpy_arrays, axis=0)
value = self.value(state).view(-1) ## Collapse across batch dimension - anyways 1
value_ = self.target_value(state_).view(-1) ## Collapse across batch dimension - anyways 1
value_[-1] = 0.0

actions, log_probs = self.actor.sample_normal(state, r = False) ## Find actions and corres. log_prob suggested by actor network, Do not reparametrize
log_probs = log_probs#.view(-1)
q1_np = self.critic1.forward(state,actions) ## How good is the suggested state-action pair
q2_np = self.critic2.forward(state,actions) ## How good is the suggested state-action pair, second opinion
critic_value = torch.min(q1_np, q2_np) ## For removing over-estimation bias
critic_value = critic_value.view(-1)

self.value.optimizer.zero_grad()
value_target = critic_value - log_probs.view(-1) ## Soft value target: Q(s, pi(s)) - log pi(a|s)
value_loss = 0.5 * F.mse_loss(value, value_target) ## How far the value estimate is from this target
value_loss.backward(retain_graph=True)
self.value.optimizer.step()

actions, log_probs = self.actor.sample_normal(state, r = True)
log_probs = log_probs.view(-1)
q1_np = self.critic1.forward(state,actions)
q2_np = self.critic2.forward(state,actions)
critic_value = torch.min(q1_np, q2_np)
critic_value = critic_value.view(-1)

actor_loss = torch.mean(log_probs- critic_value)
self.actor.optimizer.zero_grad()
actor_loss.backward(retain_graph=True)
self.actor.optimizer.step()

self.critic1.optimizer.zero_grad()
self.critic2.optimizer.zero_grad()
q_hat = self.reward_scale * reward + self.gamma * value_
q1_op = self.critic1.forward(state,action).view(-1)
q2_op = self.critic2.forward(state,action).view(-1)
critic_loss = 0.5 * (F.mse_loss(q1_op, q_hat) + F.mse_loss(q2_op, q_hat))
critic_loss.backward()
self.critic1.optimizer.step()
self.critic2.optimizer.step()

self.update_network_parameters()

rewards_earned += float(reward.sum())
#print("REWARDS EARNED IN THIS LEARNING CYCLE =", rewards_earned)
return rewards_earned/self.batch_size


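The rewritten learn() above runs one full SAC update per sampled trajectory. Picking up the in-code notes about batching, here is a hedged sketch of how the trajectories could instead be concatenated into a single batch before the tensor conversion; stack_trajs is an illustrative helper, not part of this commit:

import numpy as np

def stack_trajs(trajs, key):
    # concatenate one field across all sampled trajectories into a single batch
    return np.concatenate([traj[key] for traj in trajs], axis=0)

# reward = torch.tensor(stack_trajs(trajs, "reward"), dtype=torch.float).to(self.actor.device)
# state  = torch.tensor(stack_trajs(trajs, "observation"), dtype=torch.float).to(self.actor.device)
# ...and likewise for "action", "next_observation" and "terminal"; the value, actor
# and critic updates then run once on the whole batch instead of per trajectory.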
@torch.no_grad()
@@ -376,16 +380,13 @@ def get_action(self, observation: torch.FloatTensor):
def train_iteration(self, env, envsteps_so_far, render=False, itr_num=None, **kwargs):
#*********YOUR CODE HERE******************
self.train()
# self.sampling = True
trajs = utils.sample_n_trajectories(env, self, self.hyperparameters["ntraj"], self.hyperparameters["maxtraj"], False)
# self.sampling = False
# self.hyperparameters["alpha"] = 1/(1 + envsteps_so_far/1000)
# upd = self.update(trajs)
self.learn(trajs)
if envsteps_so_far%1000 == 0:
cur_reward = self.learn(trajs)
if envsteps_so_far%1000 == 0 and cur_reward > self.cur_max_reward:
self.cur_max_reward = cur_reward
model_save_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../best_models")
torch.save(self.state_dict(), os.path.join(model_save_path, "model_"+ self.args.env_name + "_"+ self.args.exp_name+".pth"))
return {'episode_loss': 0.0, 'trajectories': trajs, 'current_train_envsteps': self.hyperparameters["ntraj"]} #you can return more metadata if you want to
return {'episode_loss': cur_reward, 'trajectories': trajs, 'current_train_envsteps': self.hyperparameters["ntraj"]} #you can return more metadata if you want to



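The best-model checkpoint written above can later be restored for evaluation. A minimal sketch, assuming the same directory layout and an agent variable holding an instance of this class:

import os
import torch

model_save_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../best_models")
checkpoint = os.path.join(model_save_path, "model_HalfCheetah-v4_RL.pth")
agent.load_state_dict(torch.load(checkpoint))  # agent: an instance of this SAC agent (assumed)
agent.eval()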
Binary file added best_models/model_HalfCheetah-v4_RL.pth
Binary file modified best_models/model_Hopper-v4_RL.pth
16 changes: 13 additions & 3 deletions config.py
@@ -77,6 +77,7 @@
"reward_scale" : 2
},
"num_iteration": 100000,
"episode_len" : 3000

},

@@ -110,10 +111,19 @@
"RL":{
#You can add or change the keys here
"hyperparameters": {

"ntraj" : 50,
"maxtraj" : 3000,
"std_min": 0.001,
"gamma" : 0.99,
"alpha" : 0.0003,
"beta" : 0.0003,
"max_buffer_size" : 100000,
"prob_rand_sample_training" : 0.99,
"tau" : 0.005,
"reward_scale" : 2
},
"num_iteration": 100,

"num_iteration": 2000,
"episode_len" : 3000

},

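For reference, a sketch of how these entries are presumably consumed, assuming config.py exposes its top-level dict as configs (the variable name is not visible in this diff):

from config import configs   # assumed name of the top-level dict in config.py

hp = configs["RL"]["hyperparameters"]
# these feed the critic target in mujoco_agents.py:
#   q_hat = hp["reward_scale"] * reward + hp["gamma"] * target_value(next_state)
print(hp["ntraj"], hp["maxtraj"], configs["RL"]["num_iteration"])   # 50 3000 2000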
Binary file modified models/model_Hopper-v4_RL.pth
5 changes: 3 additions & 2 deletions scripts/train_agent.py
@@ -50,6 +50,7 @@ def setup_agent(args, configs):
def train_agent(args, configs):
logger = Logger(args.logdir)
max_ep_len = configs.get("episode_len", None) or env.spec.max_episode_steps
# print("EPISODE LEN =", max_ep_len)
# set random seeds
ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

@@ -120,8 +121,8 @@ def train_agent(args, configs):
plt.plot(np.array(it_num), np.array(avg_rewards))
plt.title("Evaluation Average Reward vs Iteration Number")
plt.show()


plt.savefig(args.exp_name + "_LearningCurve.png")
return


def main():
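One caveat about the savefig added above: with a blocking plt.show() the figure is often released once the window is closed, so a later plt.savefig can write an empty image on some backends. A possible reordering, if persisting the curve is the intent:

# inside train_agent, replacing the plotting block above:
plt.plot(np.array(it_num), np.array(avg_rewards))
plt.title("Evaluation Average Reward vs Iteration Number")
plt.savefig(args.exp_name + "_LearningCurve.png")  # save while the figure is still alive
plt.show()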
Binary file modified utils/__pycache__/utils.cpython-39.pyc
4 changes: 2 additions & 2 deletions utils/utils.py
@@ -19,7 +19,7 @@ def sample_trajectory(
ob = env.reset()
obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
steps = 0
print("MAX TRAJ LEN INSIDE FUNCTION 2=", max_length)
#print("MAX TRAJ LEN INSIDE FUNCTION 2=", max_length)
while True:
# render an image
if render:
@@ -93,7 +93,7 @@ def sample_n_trajectories(
):
"""Collect ntraj rollouts."""
trajs = []
print("MAX TRAJ LEN INSIDE FUNCTION =", max_length)
#print("MAX TRAJ LEN INSIDE FUNCTION =", max_length)
for _ in range(ntraj):
# collect rollout
traj = sample_trajectory(env, policy, max_length, render)
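For context, learn() in mujoco_agents.py indexes each sampled trajectory by key, so sample_trajectory presumably packs the lists initialised above into a dict along these lines (inferred from those accesses; the actual return statement is not shown in this diff):

# at the end of sample_trajectory, after the rollout loop:
traj = {
    "observation":      np.array(obs),        # states s visited along the rollout
    "action":           np.array(acs),        # actions a taken
    "reward":           np.array(rewards),    # per-step rewards
    "next_observation": np.array(next_obs),   # successor states s'
    "terminal":         np.array(terminals),  # 1 where the episode ended
}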
