# Inspired by http://karpathy.github.io/2016/05/31/rl/

from torch.autograd import Variable

import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state to a softmax distribution over actions."""

    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(state_size, 24),
            nn.Tanh(),
            nn.Linear(24, 48),
            nn.Tanh(),
            nn.Linear(48, action_size),
            nn.Softmax(dim=1)
        )

    def forward(self, state):
        return self.network(state)


class Agent:

    def __init__(self, state_size, action_size, learning_rate, learning_rate_decay):
        self.policy_network = PolicyNetwork(state_size, action_size)
        self.action_size = action_size
        self.optimizer = optim.Adam(
            self.policy_network.parameters(),
            lr=learning_rate,
            weight_decay=learning_rate_decay
        )

    def act(self, state):
        # Sample an action from the policy's output distribution.
        action_probs = self.policy_network(
            Variable(torch.from_numpy(state).float().unsqueeze(0))
        ).squeeze(0).data.numpy()
        return np.random.choice(self.action_size, p=action_probs)

    def learn(self, cumulative_reward, states, actions):
        action_log_probs = torch.log(
            self.policy_network(Variable(torch.from_numpy(states).float()))
        )
        actions = Variable(torch.from_numpy(actions).long())
        self.optimizer.zero_grad()
        # REINFORCE loss: negative log-probability of the actions taken,
        # weighted by the episode's cumulative reward.
        loss = -torch.sum(
            torch.gather(action_log_probs, dim=1, index=actions.unsqueeze(-1))
        ) * cumulative_reward
        loss.backward()
        self.optimizer.step()


def main():
    torch.manual_seed(0)
    np.random.seed(0)
    env_seed = 0
    num_episodes = 10000
    num_timesteps_max = 1000
    discount_rate = 1.0
    learning_rate = 0.001
    learning_rate_decay = 0.01
    state_size = 4
    action_size = 2

    # Train
    env = gym.make('CartPole-v0')
    env.seed(env_seed)
    agent = Agent(state_size, action_size, learning_rate, learning_rate_decay)
    cumulative_rewards = np.zeros([num_episodes])
    for episode_idx in range(num_episodes):
        state = env.reset()
        states = []
        actions = []
        for time_idx in range(num_timesteps_max):
            action = agent.act(state)
            # book-keeping
            states.append(state)
            actions.append(action)
            state, reward, done, _ = env.step(action)
            cumulative_rewards[episode_idx] += discount_rate**time_idx * reward
            if done:
                break
        if episode_idx % 100 == 0:
            print('Episode {}: Cumulative reward = {}'.format(
                episode_idx, cumulative_rewards[episode_idx]))
        # learn
        agent.learn(cumulative_rewards[episode_idx], np.array(states), np.array(actions))

    # Plot
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(3, 2)
    ax.plot(cumulative_rewards, color='black')
    ax.axhline(195, color='black', linestyle='dashed', label='solved')
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_ylabel('Cumulative Reward')
    ax.set_xlabel('Episode')
    ax.set_title('CartPole-v0 (PG)')
    filenames = ['pg.pdf', 'pg.png']
    for filename in filenames:
        fig.savefig(filename, bbox_inches='tight', dpi=200)
        print('Saved to {}'.format(filename))


if __name__ == '__main__':
    main()