# Inspired by https://keon.io/deep-q-learning/
from torch.autograd import Variable
import collections
import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class QNetwork(nn.Module):
    """Small fully connected network mapping a state to one Q-value per action."""

    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(state_size, 24),
            nn.Tanh(),
            nn.Linear(24, 48),
            nn.Tanh(),
            nn.Linear(48, action_size)
        )

    def forward(self, state):
        return self.network(state)


class DQNAgent():
    def __init__(
        self,
        exploration_rate,
        min_exploration_rate,
        exploration_rate_decay,
        learning_rate,
        learning_rate_decay,
        replay_batch_size,
        max_memory_size,
        discount_rate,
        state_size,
        action_size
    ):
        self.exploration_rate = exploration_rate
        self.min_exploration_rate = min_exploration_rate
        self.exploration_rate_decay = exploration_rate_decay
        self.replay_batch_size = replay_batch_size
        self.discount_rate = discount_rate
        self.state_size = state_size
        self.action_size = action_size
        self.memory = collections.deque(maxlen=max_memory_size)
        self.q_network = QNetwork(state_size, action_size)
        # Note: learning_rate_decay is passed to Adam's weight_decay, so it acts
        # as L2 regularization on the weights, not as a learning-rate schedule.
        self.optimizer = optim.Adam(
            self.q_network.parameters(),
            lr=learning_rate,
            weight_decay=learning_rate_decay
        )

    def remember(self, state, action, reward, next_state, done):
        """Store a transition in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Epsilon-greedy action selection."""
        if np.random.rand() <= self.exploration_rate:
            return np.random.choice(self.action_size)
        else:
            q_values = self.q_network(
                Variable(torch.Tensor(state).unsqueeze(0))
            ).squeeze(0).data.numpy()
            return np.argmax(q_values)

    def replay(self):
        """Sample a minibatch (with replacement) from memory and take one
        gradient step per transition toward the bootstrapped Q-target."""
        batch = [
            self.memory[i]
            for i in np.random.choice(
                len(self.memory), size=self.replay_batch_size
            )
        ]
        for state, action, reward, next_state, done in batch:
            # Bootstrap target: reward for terminal transitions, otherwise
            # reward plus the discounted maximum Q-value of the next state.
            if done:
                target = reward
            else:
                next_q_values = self.q_network(
                    Variable(torch.Tensor(next_state).unsqueeze(0))
                ).squeeze(0).data.numpy()
                target = reward + self.discount_rate * np.max(next_q_values)

            self.optimizer.zero_grad()
            q_values = self.q_network(
                Variable(torch.Tensor(state).unsqueeze(0))
            ).squeeze(0)
            loss = (q_values[action] - target)**2
            loss.backward()
            self.optimizer.step()

        # Decay exploration once per replay call (i.e. once per episode).
        if self.exploration_rate > self.min_exploration_rate:
            self.exploration_rate *= self.exploration_rate_decay


def main():
    torch.manual_seed(0)
    np.random.seed(0)
    env_seed = 0

    # Hyperparameters
    num_episodes = 1000
    num_timesteps_max = 1000
    replay_batch_size = 128
    max_memory_size = 100000
    discount_rate = 1.0
    exploration_rate = 1.0
    min_exploration_rate = 0.01
    exploration_rate_decay = 0.995
    learning_rate = 0.001
    learning_rate_decay = 0.01
    state_size = 4
    action_size = 2

    env = gym.make('CartPole-v0')
    env.seed(env_seed)
    agent = DQNAgent(
        exploration_rate,
        min_exploration_rate,
        exploration_rate_decay,
        learning_rate,
        learning_rate_decay,
        replay_batch_size,
        max_memory_size,
        discount_rate,
        state_size,
        action_size
    )
    cumulative_rewards = np.zeros([num_episodes])

    # Train
    for episode_idx in range(num_episodes):
        state = env.reset()
        for time_idx in range(num_timesteps_max):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

            # book-keeping
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            cumulative_rewards[episode_idx] += reward

            if done:
                break

        if episode_idx % 100 == 0:
            print('Episode {}: Cumulative reward = {}'.format(
                episode_idx, cumulative_rewards[episode_idx]))

        # learn
        agent.replay()

    # Plot cumulative reward per episode; the dashed line at 195 marks the
    # "solved" threshold for CartPole-v0.
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(3, 2)
    ax.plot(cumulative_rewards, color='black')
    ax.axhline(195, color='black', linestyle='dashed', label='solved')
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_ylabel('Cumulative Reward')
    ax.set_xlabel('Episode')
    ax.set_title('CartPole-v0')

    filenames = ['dqn.pdf', 'dqn.png']
    for filename in filenames:
        fig.savefig(filename, bbox_inches='tight', dpi=200)
        print('Saved to {}'.format(filename))


if __name__ == '__main__':
    main()