Commit

halfcheetah hyperparams
Panjete committed Nov 2, 2023
1 parent 6b607f5 commit a8ed9a7
Showing 10 changed files with 79 additions and 67 deletions.
Binary file modified __pycache__/config.cpython-39.pyc
Binary file modified agents/__pycache__/mujoco_agents.cpython-39.pyc
121 changes: 61 additions & 60 deletions agents/mujoco_agents.py
@@ -272,15 +272,16 @@ def __init__(self, observation_dim:int, action_dim:int, args = None, discrete:bo
self.reward_scale = self.hyperparameters["reward_scale"] ## Weighting given to the environment reward relative to the bootstrapped value in the critic target

# Initialising predictors and quality estimators
print("MAX TRAJ LEN =", self.hyperparameters["maxtraj"])
#print("MAX TRAJ LEN =", self.hyperparameters["maxtraj"])
self.actor = Actor(self.alpha, self.observation_dim, self.action_dim, "Actor")
self.critic1 = Critic(self.observation_dim, self.action_dim, self.beta, "Critic1")
self.critic2 = Critic(self.observation_dim, self.action_dim, self.beta, "Critic2")
self.value = Value(self.observation_dim, self.beta, "Value")
self.target_value = Value(self.observation_dim, self.beta, "target_value")

self.cur_max_reward = 0.0
self.update_network_parameters(tau = 1)


def forward(self, observation: torch.FloatTensor):
#*********YOUR CODE HERE******************
actions, _ = self.actor.sample_normal(observation, r = False)
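The Actor's sample_normal is called here and again inside learn() below, but its definition is not part of this diff. A minimal sketch of a tanh-squashed Gaussian sampler consistent with those call sites (returning actions plus log-probabilities, with r toggling the reparameterised sample); the mean/std head, the std_min clamp and the 1e-6 constant are assumptions, not taken from the repository:

import torch
import torch.distributions as D

def sample_normal(self, state, r=True):
    # sketched as an Actor method; self.forward and self.std_min are assumed attributes
    mu, sigma = self.forward(state)                # assumed: the Actor head outputs mean and std
    sigma = torch.clamp(sigma, min=self.std_min)   # std_min appears in the hyperparameters in config.py
    dist = D.Normal(mu, sigma)
    raw = dist.rsample() if r else dist.sample()   # r=True -> differentiable (reparameterised) sample
    action = torch.tanh(raw)                       # squash into [-1, 1]
    # tanh change-of-variables correction so log_prob matches the squashed action
    log_prob = dist.log_prob(raw) - torch.log(1.0 - action.pow(2) + 1e-6)
    return action, log_prob.sum(dim=-1, keepdim=True)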
@@ -302,58 +303,61 @@ def update_network_parameters(self, tau = None):
self.target_value.load_state_dict(vs_dict)

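Only the final line of update_network_parameters is visible in this hunk; presumably it performs the usual Polyak soft update of the target value network, roughly as sketched below (not the repository's exact code). Passing tau = 1, as __init__ does above, makes the target an exact copy.

def update_network_parameters(self, tau=None):
    if tau is None:
        tau = self.tau                                   # soft-update rate from the hyperparameters
    value_params = dict(self.value.named_parameters())
    target_params = dict(self.target_value.named_parameters())
    # blend each value-network parameter into the target network
    vs_dict = {name: tau * value_params[name].clone()
                     + (1 - tau) * target_params[name].clone()
               for name in value_params}
    self.target_value.load_state_dict(vs_dict)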
## For now this learns one trajectory at a time; maybe try a batched version? (a sketch follows learn() below)
def learn(self, traj):
def learn(self, trajs):
# ## May need to do np.append instead of list comprehension
traj = traj[0]
reward = torch.tensor(traj["reward"], dtype = torch.float).to(self.actor.device) ## Rewards earned in this trajectory
state_ = torch.tensor(traj["next_observation"], dtype = torch.float).to(self.actor.device) ## s' - the states the trajectory goes into after action a
state = torch.tensor(traj["observation"], dtype = torch.float).to(self.actor.device) ## s - the current states being analysed
action = torch.tensor(traj["action"], dtype = torch.float).to(self.actor.device) ## action a actually taken in the trajectory
done = torch.tensor(traj["terminal"], dtype = torch.float).to(self.actor.device) ## Terminal state or not?

#np.concatenate(numpy_arrays, axis=0)
#print("done = ", done)
value = self.value(state).view(-1) ## Collapse across batch dimension - anyways 1
value_ = self.target_value(state_).view(-1) ## Collapse across batch dimension - anyways 1
value_[-1] = 0.0

actions, log_probs = self.actor.sample_normal(state, r = False) ## Find actions and corres. log_prob suggested by actor network, Do not reparametrize
log_probs = log_probs#.view(-1)
q1_np = self.critic1.forward(state,actions) ## How good is the suggested state-action pair
q2_np = self.critic2.forward(state,actions) ## How good is the suggested state-action pair, second opinion
critic_value = torch.min(q1_np, q2_np) ## For removing over-estimation bias
critic_value = critic_value.view(-1)

self.value.optimizer.zero_grad()
value_target = critic_value - log_probs.view(-1) ## Soft value target: Q(s, pi(s)) - log pi(a|s)
value_loss = 0.5 * F.mse_loss(value, value_target) ## How far the value estimate is from this target
value_loss.backward(retain_graph=True)
self.value.optimizer.step()

actions, log_probs = self.actor.sample_normal(state, r = True)
log_probs = log_probs.view(-1)
q1_np = self.critic1.forward(state,actions)
q2_np = self.critic2.forward(state,actions)
critic_value = torch.min(q1_np, q2_np)
critic_value = critic_value.view(-1)

actor_loss = torch.mean(log_probs- critic_value)
self.actor.optimizer.zero_grad()
actor_loss.backward(retain_graph=True)
self.actor.optimizer.step()

self.critic1.optimizer.zero_grad()
self.critic2.optimizer.zero_grad()
q_hat = self.reward_scale * reward + self.gamma * value_
q1_op = self.critic1.forward(state,action).view(-1)
q2_op = self.critic2.forward(state,action).view(-1)
critic_loss = 0.5 * (F.mse_loss(q1_op, q_hat) + F.mse_loss(q2_op, q_hat))
critic_loss.backward()
self.critic1.optimizer.step()
self.critic2.optimizer.step()

self.update_network_parameters()

rewards_earned = 0.0
for traj in trajs:
reward = torch.tensor(traj["reward"], dtype = torch.float).to(self.actor.device) ## Rewards earned in this trajectory
state_ = torch.tensor(traj["next_observation"], dtype = torch.float).to(self.actor.device) ## s' - the states the trajectory goes into after action a
state = torch.tensor(traj["observation"], dtype = torch.float).to(self.actor.device) ## s - the current states being analysed
action = torch.tensor(traj["action"], dtype = torch.float).to(self.actor.device) ## action a actually taken in the trajectory
done = torch.tensor(traj["terminal"], dtype = torch.float).to(self.actor.device) ## Terminal state or not?

#np.concatenate(numpy_arrays, axis=0)
value = self.value(state).view(-1) ## Collapse across batch dimension - anyways 1
value_ = self.target_value(state_).view(-1) ## Collapse across batch dimension - anyways 1
value_[-1] = 0.0

actions, log_probs = self.actor.sample_normal(state, r = False) ## Find actions and corres. log_prob suggested by actor network, Do not reparametrize
log_probs = log_probs#.view(-1)
q1_np = self.critic1.forward(state,actions) ## How good is the suggested state-action pair
q2_np = self.critic2.forward(state,actions) ## How good is the suggested state-action pair, second opinion
critic_value = torch.min(q1_np, q2_np) ## For removing over-estimation bias
critic_value = critic_value.view(-1)

self.value.optimizer.zero_grad()
value_target = critic_value - log_probs.view(-1) ## Soft value target: Q(s, pi(s)) - log pi(a|s)
value_loss = 0.5 * F.mse_loss(value, value_target) ## How far the value estimate is from this target
value_loss.backward(retain_graph=True)
self.value.optimizer.step()

actions, log_probs = self.actor.sample_normal(state, r = True)
log_probs = log_probs.view(-1)
q1_np = self.critic1.forward(state,actions)
q2_np = self.critic2.forward(state,actions)
critic_value = torch.min(q1_np, q2_np)
critic_value = critic_value.view(-1)

actor_loss = torch.mean(log_probs- critic_value)
self.actor.optimizer.zero_grad()
actor_loss.backward(retain_graph=True)
self.actor.optimizer.step()

self.critic1.optimizer.zero_grad()
self.critic2.optimizer.zero_grad()
q_hat = self.reward_scale * reward + self.gamma * value_
q1_op = self.critic1.forward(state,action).view(-1)
q2_op = self.critic2.forward(state,action).view(-1)
critic_loss = 0.5 * (F.mse_loss(q1_op, q_hat) + F.mse_loss(q2_op, q_hat))
critic_loss.backward()
self.critic1.optimizer.step()
self.critic2.optimizer.step()

self.update_network_parameters()

rewards_earned += float(reward.sum())
#print("REWARDS EARNED IN THIS LEARNING CYCLE =", rewards_earned)
return rewards_earned/self.batch_size


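The rewritten learn() above runs one full SAC update per sampled trajectory. Picking up the in-code notes about batching, here is a hedged sketch of how the trajectories could instead be concatenated into a single batch before the tensor conversion; stack_trajs is an illustrative helper, not part of this commit:

import numpy as np

def stack_trajs(trajs, key):
    # concatenate one field across all sampled trajectories into a single batch
    return np.concatenate([traj[key] for traj in trajs], axis=0)

# reward = torch.tensor(stack_trajs(trajs, "reward"), dtype=torch.float).to(self.actor.device)
# state  = torch.tensor(stack_trajs(trajs, "observation"), dtype=torch.float).to(self.actor.device)
# ...and likewise for "action", "next_observation" and "terminal"; the value, actor
# and critic updates then run once on the whole batch instead of per trajectory.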
@torch.no_grad()
@@ -376,16 +380,13 @@ def get_action(self, observation: torch.FloatTensor):
def train_iteration(self, env, envsteps_so_far, render=False, itr_num=None, **kwargs):
#*********YOUR CODE HERE******************
self.train()
# self.sampling = True
trajs = utils.sample_n_trajectories(env, self, self.hyperparameters["ntraj"], self.hyperparameters["maxtraj"], False)
# self.sampling = False
# self.hyperparameters["alpha"] = 1/(1 + envsteps_so_far/1000)
# upd = self.update(trajs)
self.learn(trajs)
if envsteps_so_far%1000 == 0:
cur_reward = self.learn(trajs)
if envsteps_so_far%1000 == 0 and cur_reward > self.cur_max_reward:
self.cur_max_reward = cur_reward
model_save_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../best_models")
torch.save(self.state_dict(), os.path.join(model_save_path, "model_"+ self.args.env_name + "_"+ self.args.exp_name+".pth"))
return {'episode_loss': 0.0, 'trajectories': trajs, 'current_train_envsteps': self.hyperparameters["ntraj"]} #you can return more metadata if you want to
return {'episode_loss': cur_reward, 'trajectories': trajs, 'current_train_envsteps': self.hyperparameters["ntraj"]} #you can return more metadata if you want to



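The best-model checkpoint written above can later be restored for evaluation. A minimal sketch, assuming the same directory layout and an agent variable holding an instance of this class:

import os
import torch

model_save_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../best_models")
checkpoint = os.path.join(model_save_path, "model_HalfCheetah-v4_RL.pth")
agent.load_state_dict(torch.load(checkpoint))  # agent: an instance of this SAC agent (assumed)
agent.eval()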
Binary file added best_models/model_HalfCheetah-v4_RL.pth
Binary file modified best_models/model_Hopper-v4_RL.pth
16 changes: 13 additions & 3 deletions config.py
@@ -77,6 +77,7 @@
"reward_scale" : 2
},
"num_iteration": 100000,
"episode_len" : 3000

},

@@ -110,10 +111,19 @@
"RL":{
#You can add or change the keys here
"hyperparameters": {

"ntraj" : 50,
"maxtraj" : 3000,
"std_min": 0.001,
"gamma" : 0.99,
"alpha" : 0.0003,
"beta" : 0.0003,
"max_buffer_size" : 100000,
"prob_rand_sample_training" : 0.99,
"tau" : 0.005,
"reward_scale" : 2
},
"num_iteration": 100,

"num_iteration": 2000,
"episode_len" : 3000

},

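For reference, a sketch of how these entries are presumably consumed, assuming config.py exposes its top-level dict as configs (the variable name is not visible in this diff):

from config import configs   # assumed name of the top-level dict in config.py

hp = configs["RL"]["hyperparameters"]
# these feed the critic target in mujoco_agents.py:
#   q_hat = hp["reward_scale"] * reward + hp["gamma"] * target_value(next_state)
print(hp["ntraj"], hp["maxtraj"], configs["RL"]["num_iteration"])   # 50 3000 2000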
Binary file modified models/model_Hopper-v4_RL.pth
5 changes: 3 additions & 2 deletions scripts/train_agent.py
@@ -50,6 +50,7 @@ def setup_agent(args, configs):
def train_agent(args, configs):
logger = Logger(args.logdir)
max_ep_len = configs.get("episode_len", None) or env.spec.max_episode_steps
# print("EPISODE LEN =", max_ep_len)
# set random seeds
ptu.init_gpu(use_gpu=not args.no_gpu, gpu_id=args.which_gpu)

@@ -120,8 +121,8 @@ def train_agent(args, configs):
plt.plot(np.array(it_num), np.array(avg_rewards))
plt.title("Evaluation Average Reward vs Iteration Number")
plt.show()


plt.savefig(args.exp_name + "_LearningCurve.png")
return


def main():
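One caveat about the savefig added above: with a blocking plt.show() the figure is often released once the window is closed, so a later plt.savefig can write an empty image on some backends. A possible reordering, if persisting the curve is the intent:

# inside train_agent, replacing the plotting block above:
plt.plot(np.array(it_num), np.array(avg_rewards))
plt.title("Evaluation Average Reward vs Iteration Number")
plt.savefig(args.exp_name + "_LearningCurve.png")  # save while the figure is still alive
plt.show()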
Binary file modified utils/__pycache__/utils.cpython-39.pyc
4 changes: 2 additions & 2 deletions utils/utils.py
@@ -19,7 +19,7 @@ def sample_trajectory(
ob = env.reset()
obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
steps = 0
print("MAX TRAJ LEN INSIDE FUNCTION 2=", max_length)
#print("MAX TRAJ LEN INSIDE FUNCTION 2=", max_length)
while True:
# render an image
if render:
@@ -93,7 +93,7 @@ def sample_n_trajectories(
):
"""Collect ntraj rollouts."""
trajs = []
print("MAX TRAJ LEN INSIDE FUNCTION =", max_length)
#print("MAX TRAJ LEN INSIDE FUNCTION =", max_length)
for _ in range(ntraj):
# collect rollout
traj = sample_trajectory(env, policy, max_length, render)
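For context, learn() in mujoco_agents.py indexes each sampled trajectory by key, so sample_trajectory presumably packs the lists initialised above into a dict along these lines (inferred from those accesses; the actual return statement is not shown in this diff):

# at the end of sample_trajectory, after the rollout loop:
traj = {
    "observation":      np.array(obs),        # states s visited along the rollout
    "action":           np.array(acs),        # actions a taken
    "reward":           np.array(rewards),    # per-step rewards
    "next_observation": np.array(next_obs),   # successor states s'
    "terminal":         np.array(terminals),  # 1 where the episode ended
}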
