Aphoh · Demosthen · Apr 2, 2021 · Apr 6, 2021 · Apr 6, 2021 · Apr 6, 2021
diff --git a/example_run.sh b/example_run.sh
@@ -1,12 +1,4 @@
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=baseline_ppo --algo=ppo --library=rllib --one_day=15 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_300_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=3.00 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_050_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=0.50 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_010_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=0.10 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_005_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=0.05 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_003_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=0.03 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_0003_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=0.003 --energy_in_state=T --price_in_state=F
-python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name=smirl_0001_ppo --algo=ppo --library=rllib --one_day=15 --smirl_weight=0.001 --energy_in_state=T --price_in_state=F
-#python3 rl_algos/StableBaselines.py --exp_name=test --algo=ppo --library=tune --one_day=15 --energy_in_state=T --price_in_state=F
-
-
-
+# Run SAC once per --offline_sampling_prop value; the value is also the experiment-name suffix.
+for prop in .1 .3 .5 .7 .9; do
+    python3 rl_algos/StableBaselines.py --num_steps=300000 --exp_name="sac_offline${prop}" --algo=sac --library=rllib --offline_sampling_prop="${prop}"
+done
diff --git a/gym-microgrid/gym_microgrid/envs/building_data.csv b/gym-microgrid/gym_microgrid/envs/building_data.csv
diff --git a/gym-socialgame/gym_socialgame/envs/agents.py b/gym-socialgame/gym_socialgame/envs/agents.py
@@ -136,6 +136,7 @@ class DeterministicFunctionPerson(Person):
 	def __init__(self, baseline_energy_df, points_multiplier = 1, response = 't', **kwargs):
 		super().__init__(baseline_energy_df, points_multiplier)
 		self.response = response
+		print("response type: ", response)
 
 	def threshold_response_func(self, points):
 		points = np.array(points) * self.points_multiplier

diff --git a/gym-socialgame/gym_socialgame/envs/socialgame_env.py b/gym-socialgame/gym_socialgame/envs/socialgame_env.py
@@ -7,6 +7,7 @@
 from gym_socialgame.envs.utils import price_signal
 from gym_socialgame.envs.agents import *
 from gym_socialgame.envs.reward import Reward
+import wandb
 from gym_socialgame.envs.buffers import (GaussianBuffer, GaussianCircularBuffer)
 
 class SocialGameEnv(gym.Env):
@@ -24,6 +25,8 @@ def __init__(self,
         reward_function = "log_cost_regularized",
         bin_observation_space=False,
         manual_tou_magnitude=.3,
+        person_type_string="c",
+        points_multiplier=10,
         smirl_weight=None,
         circ_buffer_size=None):
 
@@ -73,6 +76,8 @@ def __init__(self,
         self.hours_in_day = 10
         self.last_smirl_reward = None
         self.last_energy_reward = None
+        self.person_type_string = person_type_string
+        self.points_multiplier=points_multiplier
 
         self.day = 0
         self.days_of_week = [0, 1, 2, 3, 4]
@@ -201,7 +206,12 @@ def _create_agents(self):
         my_baseline_energy = pd.DataFrame(data = {"net_energy_use" : working_hour_energy})
 
         for i in range(self.number_of_participants):
-            player = CurtailAndShiftPerson(my_baseline_energy, points_multiplier = 10, response = 'l')
+            if self.person_type_string=="c":  # "c" selects CurtailAndShiftPerson
+                player = CurtailAndShiftPerson(my_baseline_energy, points_multiplier = self.points_multiplier, response = 'l')
+            elif self.person_type_string=="d":  # "d" selects DeterministicFunctionPerson
+                player = DeterministicFunctionPerson(my_baseline_energy, response=self.response_type_string, points_multiplier = self.points_multiplier)
+            else:  # fail loudly instead of leaving `player` unbound for an unknown type code
+                raise ValueError("person_type_string must be 'c' or 'd', got {!r}".format(self.person_type_string))
             player_dict['player_{}'.format(i)] = player
 
         return player_dict
@@ -379,7 +387,7 @@ def step(self, action):
         self.action = action
 
         if not self.action_space.contains(action):
-            print("made it within the if statement in SG_E that tests if the the action space doesn't have the action")
+            print("made it within the if statement in SG_E that tests if the action space doesn't have the action")
             action = np.asarray(action)
             if self.action_space_string == 'continuous':
                 action = np.clip(action, -1, 1) #TODO: check if correct
@@ -408,6 +416,10 @@ def step(self, action):
         if self.use_smirl:
             self.buffer.add(observation)
 
+        # if not self.total_iter % 10:
+        #     print("Iteration: "+str(self.total_iter) + " reward: " + str(reward))
+        #     wandb.log({"environment_reward":reward})
+
         info = {}
         return observation, reward, done, info
 
@@ -510,19 +522,8 @@ class SocialGameMetaEnv(SocialGameEnvRLLib):
     def __init__(self,
         env_config,
         task = None):
-
-#        self.goal_direction = goal_direction if goal_direction else 1.0
-
-        self.task = (task if task else {
-            "person_type":np.random.choice([DeterministicFunctionPerson, CurtailAndShiftPerson]),
-            "points_multiplier":np.random.choice(range(20)),
-            "response":np.random.choice(['t','l', 's']),
-            "shiftable_load_frac":np.random.uniform(0, 1),
-            "curtailable_load_frac":np.random.uniform(0, 1),
-            "shiftByHours":np.random.choice(range(8), ),
-            "maxCurtailHours":np.random.choice(range(8),)
-        })
-
+        self.mode = env_config["mode"]
+        self.task = (task if task else self.sample_tasks(1)[0])
         super().__init__(
             env_config=env_config,
         )
@@ -533,24 +534,26 @@ def sample_tasks(self, n_tasks):
         """
         n_tasks will be passed in as a hyperparameter
         """
-        # points_multiplier = 1,
-        # response = 't'
-        # baseline_energy_df,
-        # points_multiplier = 1,
-        # shiftable_load_frac = .7,
-		# curtailable_load_frac = .4,
-        # shiftByHours = 3,
-        # maxCurtailHours=5,
-        # baseline_energy_df_variance =  # add random noise to the existing?
-
-        person_type = np.random.choice([DeterministicFunctionPerson, CurtailAndShiftPerson], size = (n_tasks, ))
-        points_multiplier = np.random.choice(range(20), size = (n_tasks, ))
-        response = np.random.choice(['t','l', 's'], size = (n_tasks, ))
-        shiftable_load_frac = np.random.uniform(0, 1, size = (n_tasks, ))
-        curtailable_load_frac = np.random.uniform(0, 1, size = (n_tasks, ))
-        shiftByHours = np.random.choice(range(8), (n_tasks, ))
-        maxCurtailHours=np.random.choice(range(8), (n_tasks, ))
-
+        if self.mode == "train":  # training tasks: randomly sampled parameters
+            print("SAMPLING TRAIN ENVIRONMENT")
+            person_type = np.random.choice([DeterministicFunctionPerson], size = (n_tasks, ))
+            points_multiplier = [10] * n_tasks
+            response = np.random.choice(['s', 'l', 't'], size = (n_tasks, ))
+            shiftable_load_frac = np.random.uniform(0, 1, size = (n_tasks, ))
+            curtailable_load_frac = np.random.uniform(0, 1, size = (n_tasks, ))
+            shiftByHours = np.random.choice(range(8), (n_tasks, ))
+            maxCurtailHours = np.random.choice(range(8), (n_tasks, ))
+        elif self.mode == "test":  # test tasks: one fixed parameter set repeated n_tasks times
+            print("SAMPLING TEST ENVIRONMENT")
+            person_type = [CurtailAndShiftPerson] * n_tasks
+            points_multiplier = [10] * n_tasks
+            response = ['t'] * n_tasks
+            shiftable_load_frac = [0.2] * n_tasks
+            curtailable_load_frac = [0.2] * n_tasks
+            shiftByHours = [2] * n_tasks
+            maxCurtailHours = [5] * n_tasks
+        else:  # any other mode is a configuration error; report the accepted values and the actual one
+            raise ValueError("env_config['mode'] must be 'train' or 'test', got {!r}".format(self.mode))
         task_parameters = {
             "person_type":person_type,
             "points_multiplier":points_multiplier,
@@ -566,7 +569,7 @@ def sample_tasks(self, n_tasks):
             temp_dict = {k: v[i] for k, v in task_parameters.items()}
             tasks_dicts.append(temp_dict)
 
-        return task_dicts
+        return tasks_dicts
 
 
     def set_task(self, task):
@@ -575,6 +578,7 @@ def set_task(self, task):
             task: task of the meta-learning environment
         """
         self.task=task
+        self.player_dict = self._create_agents()
         # self.person_type = task["person_type"]
         # self.points_multiplier = task["points_multiplier"]
         # self.response = task["response"]

diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,12 @@
-cvxpy==1.1.7
 gym==0.17.3
 scikit-learn==0.23.2
 wandb==0.10.25
 stable-baselines3==0.11.1
 tensorboard==2.3.0
+tensorflow-gpu==2.3.2
+tensorflow-probability==0.11.1
 ray[rllib,tune]==1.2.0
+higher==0.2.1
 GPUtil==1.4.0
 pandas==1.1.5
 tables==3.6.1
diff --git a/rl_algos/.gitignore b/rl_algos/.gitignore
@@ -1 +1,4 @@
 logs/
+/ppo_output_sim_data
+/sac_output_sim_data2
+/offline_data