"""
Title: Proximal Policy Optimization
Author: [Ilias Chrysovergis](https://twitter.com/iliachry)
Date created: 2021/06/24
Last modified: 2024/03/12
Description: Implementation of a Proximal Policy Optimization agent for the CartPole-v1 environment.
Accelerator: None
"""
"""
## Introduction
This code example solves the CartPole-v1 environment using a Proximal Policy Optimization (PPO) agent.
### CartPole-v1
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
The system is controlled by applying a force of +1 or -1 to the cart.
The pendulum starts upright, and the goal is to prevent it from falling over.
A reward of +1 is provided for every timestep that the pole remains upright.
The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.
After 200 steps the episode ends. Thus, the highest return we can get is equal to 200.
[CartPole-v1](https://gymnasium.farama.org/environments/classic_control/cart_pole/)
### Proximal Policy Optimization
PPO is a policy gradient method and can be used for environments with either discrete or continuous action spaces.
It trains a stochastic policy in an on-policy way. Also, it utilizes the actor critic method. The actor maps the
observation to an action and the critic gives an expectation of the rewards of the agent for the observation given.
Firstly, it collects a set of trajectories for each epoch by sampling from the latest version of the stochastic policy.
Then, the rewards-to-go and the advantage estimates are computed in order to update the policy and fit the value function.
The policy is updated via a stochastic gradient ascent optimizer, while the value function is fitted via some gradient descent algorithm.
This procedure is applied for many epochs until the environment is solved.

- [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347)
- [OpenAI Spinning Up docs - PPO](https://spinningup.openai.com/en/latest/algorithms/ppo.html)
### Note
This code example uses Keras and Tensorflow v2. It is based on the PPO Original Paper,
the OpenAI's Spinning Up docs for PPO, and the OpenAI's Spinning Up implementation of PPO using Tensorflow v1.
[OpenAI Spinning Up Github - PPO](https://github.com/openai/spinningup/blob/master/spinup/algos/tf1/ppo/ppo.py)
"""
"""
## Libraries
For this example the following libraries are used:
1. `numpy` for n-dimensional arrays
2. `tensorflow` and `keras` for building the deep RL PPO agent
3. `gymnasium` for getting everything we need about the environment
4. `scipy.signal` for calculating the discounted cumulative sums of vectors
"""
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
from keras import layers
import numpy as np
import tensorflow as tf
import gymnasium as gym
import scipy.signal
"""
## Functions and class
"""
def discounted_cumulative_sums(x, discount):
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
class Buffer:
def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
self.observation_buffer = np.zeros(
(size, observation_dimensions), dtype=np.float32
)
self.action_buffer = np.zeros(size, dtype=np.int32)
self.advantage_buffer = np.zeros(size, dtype=np.float32)
self.reward_buffer = np.zeros(size, dtype=np.float32)
self.return_buffer = np.zeros(size, dtype=np.float32)
self.value_buffer = np.zeros(size, dtype=np.float32)
self.logprobability_buffer = np.zeros(size, dtype=np.float32)
self.gamma, self.lam = gamma, lam
self.pointer, self.trajectory_start_index = 0, 0
def store(self, observation, action, reward, value, logprobability):
self.observation_buffer[self.pointer] = observation
self.action_buffer[self.pointer] = action
self.reward_buffer[self.pointer] = reward
self.value_buffer[self.pointer] = value
self.logprobability_buffer[self.pointer] = logprobability
self.pointer += 1
def finish_trajectory(self, last_value=0):
path_slice = slice(self.trajectory_start_index, self.pointer)
rewards = np.append(self.reward_buffer[path_slice], last_value)
values = np.append(self.value_buffer[path_slice], last_value)
deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]
self.advantage_buffer[path_slice] = discounted_cumulative_sums(
deltas, self.gamma * self.lam
)
self.return_buffer[path_slice] = discounted_cumulative_sums(
rewards, self.gamma
)[:-1]
self.trajectory_start_index = self.pointer
def get(self):
self.pointer, self.trajectory_start_index = 0, 0
advantage_mean, advantage_std = (
np.mean(self.advantage_buffer),
np.std(self.advantage_buffer),
)
self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
return (
self.observation_buffer,
self.action_buffer,
self.advantage_buffer,
self.return_buffer,
self.logprobability_buffer,
)
def mlp(x, sizes, activation=keras.activations.tanh, output_activation=None):
for size in sizes[:-1]:
x = layers.Dense(units=size, activation=activation)(x)
return layers.Dense(units=sizes[-1], activation=output_activation)(x)
def logprobabilities(logits, a):
logprobabilities_all = keras.ops.log_softmax(logits)
logprobability = keras.ops.sum(
keras.ops.one_hot(a, num_actions) * logprobabilities_all, axis=1
)
return logprobability
seed_generator = keras.random.SeedGenerator(1337)
@tf.function
def sample_action(observation):
logits = actor(observation)
action = keras.ops.squeeze(
keras.random.categorical(logits, 1, seed=seed_generator), axis=1
)
return logits, action
@tf.function
def train_policy(
observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):
with tf.GradientTape() as tape:
ratio = keras.ops.exp(
logprobabilities(actor(observation_buffer), action_buffer)
- logprobability_buffer
)
min_advantage = keras.ops.where(
advantage_buffer > 0,
(1 + clip_ratio) * advantage_buffer,
(1 - clip_ratio) * advantage_buffer,
)
policy_loss = -keras.ops.mean(
keras.ops.minimum(ratio * advantage_buffer, min_advantage)
)
policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))
kl = keras.ops.mean(
logprobability_buffer
- logprobabilities(actor(observation_buffer), action_buffer)
)
kl = keras.ops.sum(kl)
return kl
@tf.function
def train_value_function(observation_buffer, return_buffer):
with tf.GradientTape() as tape:
value_loss = keras.ops.mean((return_buffer - critic(observation_buffer)) ** 2)
value_grads = tape.gradient(value_loss, critic.trainable_variables)
value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables))
"""
## Hyperparameters
"""
steps_per_epoch = 4000
epochs = 30
gamma = 0.99
clip_ratio = 0.2
policy_learning_rate = 3e-4
value_function_learning_rate = 1e-3
train_policy_iterations = 80
train_value_iterations = 80
lam = 0.97
target_kl = 0.01
hidden_sizes = (64, 64)
render = False
"""
## Initializations
"""
env = gym.make("CartPole-v1")
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n
buffer = Buffer(observation_dimensions, steps_per_epoch)
observation_input = keras.Input(shape=(observation_dimensions,), dtype="float32")
logits = mlp(observation_input, list(hidden_sizes) + [num_actions])
actor = keras.Model(inputs=observation_input, outputs=logits)
value = keras.ops.squeeze(mlp(observation_input, list(hidden_sizes) + [1]), axis=1)
critic = keras.Model(inputs=observation_input, outputs=value)
policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)
observation, _ = env.reset()
episode_return, episode_length = 0, 0
"""
## Train
"""
for epoch in range(epochs):
sum_return = 0
sum_length = 0
num_episodes = 0
for t in range(steps_per_epoch):
if render:
env.render()
observation = observation.reshape(1, -1)
logits, action = sample_action(observation)
observation_new, reward, done, _, _ = env.step(action[0].numpy())
episode_return += reward
episode_length += 1
value_t = critic(observation)
logprobability_t = logprobabilities(logits, action)
buffer.store(observation, action, reward, value_t, logprobability_t)
observation = observation_new
terminal = done
if terminal or (t == steps_per_epoch - 1):
last_value = 0 if done else critic(observation.reshape(1, -1))
buffer.finish_trajectory(last_value)
sum_return += episode_return
sum_length += episode_length
num_episodes += 1
observation, _ = env.reset()
episode_return, episode_length = 0, 0
(
observation_buffer,
action_buffer,
advantage_buffer,
return_buffer,
logprobability_buffer,
) = buffer.get()
for _ in range(train_policy_iterations):
kl = train_policy(
observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
)
if kl > 1.5 * target_kl:
break
for _ in range(train_value_iterations):
train_value_function(observation_buffer, return_buffer)
print(
f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
)
"""
## Visualizations
Before training:

After 8 epochs of training:

After 20 epochs of training:

"""