-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest.py
More file actions
59 lines (54 loc) · 2.04 KB
/
test.py
File metadata and controls
59 lines (54 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from ddpg import Agent
import gym
from gym import wrappers
import os
import numpy as np
import matplotlib.pyplot as plt
# Total number of training episodes to run for the DDPG agent.
N_EPISODES = 10000
def main():
    """Train a DDPG agent on Pendulum-v0 and plot the per-episode scores.

    Builds the environment, derives the state/action dimensions and action
    bounds from it, trains an ``Agent`` for ``N_EPISODES`` episodes (storing
    each transition and learning on-line), then shows a matplotlib plot of
    the cumulative reward per episode.

    Side effects: prints progress to stdout, renders the environment each
    step, and opens a matplotlib window at the end.
    """
    # get simulation environment
    env = gym.make("Pendulum-v0")
    # Derive dimensions from the env spaces (Pendulum-v0: 3-dim obs, 1-dim action).
    state_dims = [len(env.observation_space.low)]
    action_dims = [len(env.action_space.low)]
    # Per-dimension [low, high] bounds the agent uses to clip/scale actions.
    action_boundaries = [env.action_space.low, env.action_space.high]
    print(action_boundaries)
    # create agent with environment parameters
    agent = Agent(state_dims = state_dims, action_dims = action_dims,
                action_boundaries = action_boundaries, actor_lr = 5e-3,
                critic_lr = 2e-2, batch_size = 128, gamma = 0.99, rand_steps = 2,
                buf_size = int(1e6), tau = 0.001, fcl1_size = 400, fcl2_size = 600)
    # Fixed seed for reproducible exploration noise.
    np.random.seed(0)
    scores = []
    # training loop: call remember on predicted states and train the models
    try:
        # NOTE: the original kept a separate `episode` counter that simply
        # mirrored the loop index; the index itself is used instead.
        for episode in range(N_EPISODES):
            # get initial state
            state = env.reset()
            terminal = False
            score = 0
            # proceed until reaching an exit state
            while not terminal:
                # predict new action
                action = agent.get_action(state, episode)
                # perform the transition according to the predicted action
                state_new, reward, terminal, info = env.step(action)
                # store the transaction in the memory
                agent.remember(state, state_new, action, reward, terminal)
                # adjust the weights according to the new transaction
                agent.learn()
                # iterate to the next state
                state = state_new
                score += reward
                env.render()
            scores.append(score)
            print("Iteration {:d} --> score {:.2f}. Running average {:.2f}".format(
                episode, score, np.mean(scores)))
    finally:
        # Always release the env (and its render window), even on interrupt.
        env.close()
    plt.plot(scores)
    plt.xlabel("Episode")
    plt.ylabel("Cumulative reward")
    plt.show()
if __name__ == "__main__":
    # Pin TensorFlow to GPU 0: order devices by PCI bus id and expose only
    # the first one before any framework initialization happens.
    os.environ.update({
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    })
    main()