# Inspired by http://karpathy.github.io/2016/05/31/rl/

from torch.autograd import Variable

import gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state to a softmax distribution over actions."""

    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(state_size, 24),
            nn.Tanh(),
            nn.Linear(24, 48),
            nn.Tanh(),
            nn.Linear(48, action_size),
            nn.Softmax(dim=1)
        )

    def forward(self, state):
        return self.network(state)


class Agent:

    def __init__(self, state_size, action_size, learning_rate, learning_rate_decay):
        self.policy_network = PolicyNetwork(state_size, action_size)
        self.action_size = action_size
        self.optimizer = optim.Adam(
            self.policy_network.parameters(),
            lr=learning_rate,
            weight_decay=learning_rate_decay
        )

    def act(self, state):
        # Sample an action from the policy's output distribution.
        action_probs = self.policy_network(
            Variable(torch.from_numpy(state).float().unsqueeze(0))
        ).squeeze(0).data.numpy()
        return np.random.choice(self.action_size, p=action_probs)

    def learn(self, cumulative_reward, states, actions):
        action_log_probs = torch.log(
            self.policy_network(Variable(torch.from_numpy(states).float()))
        )
        actions = Variable(torch.from_numpy(actions).long())
        self.optimizer.zero_grad()
        # REINFORCE loss: negative log-probability of the actions taken,
        # weighted by the episode's cumulative reward.
        loss = -torch.sum(
            torch.gather(action_log_probs, dim=1, index=actions.unsqueeze(-1))
        ) * cumulative_reward
        loss.backward()
        self.optimizer.step()


def main():
    torch.manual_seed(0)
    np.random.seed(0)
    env_seed = 0
    num_episodes = 10000
    num_timesteps_max = 1000
    discount_rate = 1.0
    learning_rate = 0.001
    learning_rate_decay = 0.01
    state_size = 4
    action_size = 2

    # Train
    env = gym.make('CartPole-v0')
    env.seed(env_seed)
    agent = Agent(state_size, action_size, learning_rate, learning_rate_decay)
    cumulative_rewards = np.zeros([num_episodes])
    for episode_idx in range(num_episodes):
        state = env.reset()
        states = []
        actions = []
        for time_idx in range(num_timesteps_max):
            action = agent.act(state)
            # book-keeping
            states.append(state)
            actions.append(action)
            state, reward, done, _ = env.step(action)
            cumulative_rewards[episode_idx] += discount_rate**time_idx * reward
            if done:
                break
        if episode_idx % 100 == 0:
            print('Episode {}: Cumulative reward = {}'.format(
                episode_idx, cumulative_rewards[episode_idx]))
        # learn
        agent.learn(cumulative_rewards[episode_idx], np.array(states), np.array(actions))

    # Plot
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(3, 2)
    ax.plot(cumulative_rewards, color='black')
    ax.axhline(195, color='black', linestyle='dashed', label='solved')
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_ylabel('Cumulative Reward')
    ax.set_xlabel('Episode')
    ax.set_title('CartPole-v0 (PG)')
    filenames = ['pg.pdf', 'pg.png']
    for filename in filenames:
        fig.savefig(filename, bbox_inches='tight', dpi=200)
        print('Saved to {}'.format(filename))


if __name__ == '__main__':
    main()