# -*- coding: utf-8 -*-
"""
Train a Mario-playing RL Agent
===============================

**Authors:** `Yuansong Feng <https://github.com/YuansongFeng>`__, `Suraj Subramanian <https://github.com/suraj813>`__, `Howard Wang <https://github.com/hw26>`__, `Steven Guo <https://github.com/GuoYuzhang>`__.


This tutorial walks you through the fundamentals of Deep Reinforcement
Learning. At the end, you will implement an AI-powered Mario (using
`Double Deep Q-Networks <https://arxiv.org/pdf/1509.06461.pdf>`__) that
can play the game by itself.

Although no prior knowledge of RL is necessary for this tutorial, you
can familiarize yourself with these RL
`concepts <https://spinningup.openai.com/en/latest/spinningup/rl_intro.html>`__,
and have this handy
`cheatsheet <https://colab.research.google.com/drive/1eN33dPVtdPViiS1njTW_-r-IYCDTFU7N>`__
as your companion. The full code is available
`here <https://github.com/yuansongFeng/MadMario/>`__.

.. figure:: /_static/img/mario.gif
   :alt: mario

"""


######################################################################
#
#
# .. code-block:: bash
#
#     %%bash
#     pip install gym-super-mario-bros==7.4.0
#     pip install tensordict==0.3.0
#     pip install torchrl==0.3.0
#

import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, datetime, os

# Gym is an OpenAI toolkit for RL
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack

# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace

# Super Mario environment for OpenAI Gym
import gym_super_mario_bros

from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

######################################################################
# RL Definitions
# """"""""""""""""
#
# **Environment** The world that an agent interacts with and learns from.
#
# **Action** :math:`a` : How the Agent responds to the Environment. The
# set of all possible Actions is called *action-space*.
#
# **State** :math:`s` : The current characteristic of the Environment. The
# set of all possible States the Environment can be in is called
# *state-space*.
#
# **Reward** :math:`r` : Reward is the key feedback from Environment to
# Agent. It is what drives the Agent to learn and to change its future
# action. An aggregation of rewards over multiple time steps is called
# **Return**.
#
# **Optimal Action-Value function** :math:`Q^*(s,a)` : Gives the expected
# return if you start in state :math:`s`, take an arbitrary action
# :math:`a`, and then for each future time step take the action that
# maximizes returns. :math:`Q` can be said to stand for the “quality” of
# the action in a state. We try to approximate this function.
#

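######################################################################
# As a point of reference (standard RL background, an addition to the
# original text): the optimal action-value function satisfies the Bellman
# optimality equation, with discount factor :math:`\gamma` and next state
# :math:`s'`. This is the recursion that the TD learning rule later in this
# tutorial approximates:
#
# .. math::
#
#
#    Q^{*}(s,a) = \mathbb{E}\left[r + \gamma \max_{a'} Q^{*}(s', a')\right]
#
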
######################################################################
# Environment
# """"""""""""""""
#
# Initialize Environment
# ------------------------
#
# In Mario, the environment consists of tubes, mushrooms and other
# components.
#
# When Mario makes an action, the environment responds with the changed
# (next) state, reward and other info.
#

# Initialize Super Mario environment (in v0.26 change render mode to 'human' to see results on the screen)
if gym.__version__ < '0.26':
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", new_step_api=True)
else:
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0", render_mode='rgb', apply_api_compatibility=True)

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, trunc, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

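######################################################################
# The ``info`` dictionary returned by ``gym_super_mario_bros`` carries game
# metadata. ``info["flag_get"]`` is what the training loop at the end of this
# tutorial uses to detect that Mario reached the flag; ``x_pos`` (listed in
# the gym-super-mario-bros docs, included here only as an illustration)
# tracks his horizontal progress.

print(info["flag_get"], info.get("x_pos"))
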
######################################################################
# Preprocess Environment
# ------------------------
#
# Environment data is returned to the agent in ``next_state``. As you saw
# above, each state is represented by a ``[3, 240, 256]`` size array.
# Often that is more information than our agent needs; for instance,
# Mario’s actions do not depend on the color of the pipes or the sky!
#
# We use **Wrappers** to preprocess environment data before sending it to
# the agent.
#
# ``GrayScaleObservation`` is a common wrapper to transform an RGB image
# to grayscale; doing so reduces the size of the state representation
# without losing useful information. Now the size of each state:
# ``[1, 240, 256]``
#
# ``ResizeObservation`` downsamples each observation into a square image.
# New size: ``[1, 84, 84]``
#
# ``SkipFrame`` is a custom wrapper that inherits from ``gym.Wrapper`` and
# implements the ``step()`` function. Because consecutive frames don’t
# vary much, we can skip n intermediate frames without losing much
# information. The n-th frame aggregates rewards accumulated over each
# skipped frame.
#
# ``FrameStack`` is a wrapper that allows us to squash consecutive frames
# of the environment into a single observation point to feed to our
# learning model. This way, we can identify if Mario was landing or
# jumping based on the direction of his movement in the previous several
# frames.
#

class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        """Return only every `skip`-th frame"""
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, trunk, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def permute_orientation(self, observation):
        # permute [H, W, C] array to [C, H, W] tensor
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        observation = self.permute_orientation(observation)
        transform = T.Grayscale()
        observation = transform(observation)
        return observation


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transforms = T.Compose(
            [T.Resize(self.shape, antialias=True), T.Normalize(0, 255)]
        )
        observation = transforms(observation).squeeze(0)
        return observation


# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
if gym.__version__ < '0.26':
    env = FrameStack(env, num_stack=4, new_step_api=True)
else:
    env = FrameStack(env, num_stack=4)

######################################################################
# After applying the above wrappers to the environment, the final wrapped
# state consists of 4 gray-scaled consecutive frames stacked together, as
# shown above in the image on the left. Each time Mario makes an action,
# the environment responds with a state of this structure. The structure
# is represented by a 3-D array of size ``[4, 84, 84]``.
#
# .. figure:: /_static/img/mario_env.png
#    :alt: picture
#
#

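######################################################################
# As a quick, optional sanity check (an addition, not part of the original
# tutorial), we can reset the wrapped environment and confirm the shape of
# the stacked observation. ``FrameStack`` returns a ``LazyFrames`` object,
# which the agent below converts with ``__array__()``.

check_state = env.reset()
check_state = check_state[0] if isinstance(check_state, tuple) else check_state
print(check_state.__array__().shape)  # expected: (4, 84, 84)
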
######################################################################
# Agent
# """""""""
#
# We create a class ``Mario`` to represent our agent in the game. Mario
# should be able to:
#
# - **Act** according to the optimal action policy based on the current
#   state (of the environment).
#
# - **Remember** experiences. Experience = (current state, current
#   action, reward, next state). Mario *caches* and later *recalls* his
#   experiences to update his action policy.
#
# - **Learn** a better action policy over time.
#


class Mario:
    def __init__(self):
        pass

    def act(self, state):
        """Given a state, choose an epsilon-greedy action"""
        pass

    def cache(self, experience):
        """Add the experience to memory"""
        pass

    def recall(self):
        """Sample experiences from memory"""
        pass

    def learn(self):
        """Update online action value (Q) function with a batch of experiences"""
        pass


######################################################################
# In the following sections, we will populate Mario’s parameters and
# define his functions.
#

######################################################################
# Act
# --------------
#
# For any given state, an agent can choose to take the best known action
# (**exploit**) or a random action (**explore**).
#
# Mario randomly explores with a chance of ``self.exploration_rate``; when
# he chooses to exploit, he relies on ``MarioNet`` (implemented in the
# ``Learn`` section) to provide the optimal action.
#


class Mario:
    def __init__(self, state_dim, action_dim, save_dir):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.save_dir = save_dir

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Mario's DNN to predict the optimal action - we implement this in the Learn section
        self.net = MarioNet(self.state_dim, self.action_dim).float()
        self.net = self.net.to(device=self.device)

        self.exploration_rate = 1
        self.exploration_rate_decay = 0.99999975
        self.exploration_rate_min = 0.1
        self.curr_step = 0

        self.save_every = 5e5  # no. of experiences between saving Mario Net

    def act(self, state):
        """
        Given a state, choose an epsilon-greedy action and update value of step.

        Inputs:
            state (``LazyFrame``): A single observation of the current state, dimension is (state_dim)
        Outputs:
            ``action_idx`` (``int``): An integer representing which action Mario will perform
        """
        # EXPLORE
        if np.random.rand() < self.exploration_rate:
            action_idx = np.random.randint(self.action_dim)

        # EXPLOIT
        else:
            state = state[0].__array__() if isinstance(state, tuple) else state.__array__()
            state = torch.tensor(state, device=self.device).unsqueeze(0)
            action_values = self.net(state, model="online")
            action_idx = torch.argmax(action_values, axis=1).item()

        # decrease exploration_rate
        self.exploration_rate *= self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

        # increment step
        self.curr_step += 1
        return action_idx

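######################################################################
# A rough back-of-the-envelope calculation (an addition, not part of the
# original tutorial): with ``exploration_rate_decay = 0.99999975`` and a
# floor of ``0.1``, the exploration rate decays geometrically and only
# reaches its minimum after roughly
# :math:`\ln(0.1) / \ln(0.99999975) \approx 9.2` million steps, so expect
# Mario to keep exploring heavily for a long time.

steps_to_min_eps = np.log(0.1) / np.log(0.99999975)
print(f"Steps until exploration_rate reaches its minimum: ~{steps_to_min_eps:,.0f}")
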
######################################################################
# Cache and Recall
# ----------------------
#
# These two functions serve as Mario’s “memory” process.
#
# ``cache()``: Each time Mario performs an action, he stores the
# ``experience`` to his memory. His experience includes the current
# *state*, *action* performed, *reward* from the action, the *next state*,
# and whether the game is *done*.
#
# ``recall()``: Mario randomly samples a batch of experiences from his
# memory, and uses that to learn the game.
#


class Mario(Mario):  # subclassing for continuity
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim, action_dim, save_dir)
        self.memory = TensorDictReplayBuffer(storage=LazyMemmapStorage(100000, device=torch.device("cpu")))
        self.batch_size = 32

    def cache(self, state, next_state, action, reward, done):
        """
        Store the experience to self.memory (replay buffer)

        Inputs:
            state (``LazyFrame``),
            next_state (``LazyFrame``),
            action (``int``),
            reward (``float``),
            done (``bool``)
        """
        def first_if_tuple(x):
            return x[0] if isinstance(x, tuple) else x
        state = first_if_tuple(state).__array__()
        next_state = first_if_tuple(next_state).__array__()

        state = torch.tensor(state)
        next_state = torch.tensor(next_state)
        action = torch.tensor([action])
        reward = torch.tensor([reward])
        done = torch.tensor([done])

        self.memory.add(TensorDict({"state": state, "next_state": next_state, "action": action, "reward": reward, "done": done}, batch_size=[]))

    def recall(self):
        """
        Retrieve a batch of experiences from memory
        """
        batch = self.memory.sample(self.batch_size).to(self.device)
        state, next_state, action, reward, done = (batch.get(key) for key in ("state", "next_state", "action", "reward", "done"))
        return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()

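######################################################################
# To make the replay-buffer API more concrete, here is a tiny, standalone
# illustration (an addition to the tutorial; the tensors and sizes below are
# dummy values, not Mario's actual data): ``add()`` stores one ``TensorDict``
# per experience, and ``sample()`` returns a batched ``TensorDict``.

demo_buffer = TensorDictReplayBuffer(storage=LazyMemmapStorage(100))
for _ in range(8):
    demo_buffer.add(
        TensorDict({"state": torch.rand(4, 84, 84), "reward": torch.tensor([1.0])}, batch_size=[])
    )
demo_batch = demo_buffer.sample(4)
print(demo_batch["state"].shape)  # torch.Size([4, 4, 84, 84])
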
######################################################################
# Learn
# --------------
#
# Mario uses the `DDQN algorithm <https://arxiv.org/pdf/1509.06461>`__
# under the hood. DDQN uses two ConvNets - :math:`Q_{online}` and
# :math:`Q_{target}` - that independently approximate the optimal
# action-value function.
#
# In our implementation, :math:`Q_{online}` and :math:`Q_{target}` share the
# same architecture (the CNN built in ``MarioNet`` below) but keep separate
# parameters; the target network starts as a copy of the online network.
# :math:`\theta_{target}` (the parameters of :math:`Q_{target}`) is frozen
# to prevent updating by backprop. Instead, it is periodically synced with
# :math:`\theta_{online}` (more on this later).
#
# Neural Network
# ~~~~~~~~~~~~~~~~~~


class MarioNet(nn.Module):
    """mini CNN structure
    input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        c, h, w = input_dim

        if h != 84:
            raise ValueError(f"Expecting input height: 84, got: {h}")
        if w != 84:
            raise ValueError(f"Expecting input width: 84, got: {w}")

        self.online = self.__build_cnn(c, output_dim)

        self.target = self.__build_cnn(c, output_dim)
        self.target.load_state_dict(self.online.state_dict())

        # Q_target parameters are frozen.
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        if model == "online":
            return self.online(input)
        elif model == "target":
            return self.target(input)

    def __build_cnn(self, c, output_dim):
        return nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        )

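######################################################################
# Where does the ``3136`` in the first ``nn.Linear`` layer come from? A quick
# check (an addition, not in the original tutorial): each convolution shrinks
# the 84x84 frame, and the 64 resulting feature maps are flattened before the
# dense layers.

def conv2d_size_out(size, kernel_size, stride):
    # output spatial size of a Conv2d layer with no padding
    return (size - kernel_size) // stride + 1

_h = conv2d_size_out(conv2d_size_out(conv2d_size_out(84, 8, 4), 4, 2), 3, 1)  # 84 -> 20 -> 9 -> 7
print(64 * _h * _h)  # 3136 features feed into nn.Linear(3136, 512)
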
######################################################################
# TD Estimate & TD Target
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Two values are involved in learning:
#
# **TD Estimate** - the predicted optimal :math:`Q^*` for a given state
# :math:`s`
#
# .. math::
#
#
#    {TD}_e = Q_{online}^*(s,a)
#
# **TD Target** - aggregation of current reward and the estimated
# :math:`Q^*` in the next state :math:`s'`
#
# .. math::
#
#
#    a' = \arg\max_{a} Q_{online}(s', a)
#
# .. math::
#
#
#    {TD}_t = r + \gamma Q_{target}^*(s',a')
#
# Because we don’t know what next action :math:`a'` will be, we use the
# action :math:`a'` that maximizes :math:`Q_{online}` in the next state
# :math:`s'`.
#
# Notice we use the
# `@torch.no_grad() <https://pytorch.org/docs/stable/generated/torch.no_grad.html#no-grad>`__
# decorator on ``td_target()`` to disable gradient calculations here
# (because we don’t need to backpropagate on :math:`\theta_{target}`).
#


class Mario(Mario):
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim, action_dim, save_dir)
        self.gamma = 0.9

    def td_estimate(self, state, action):
        current_Q = self.net(state, model="online")[
            np.arange(0, self.batch_size), action
        ]  # Q_online(s,a)
        return current_Q

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        next_state_Q = self.net(next_state, model="online")
        best_action = torch.argmax(next_state_Q, axis=1)
        next_Q = self.net(next_state, model="target")[
            np.arange(0, self.batch_size), best_action
        ]
        return (reward + (1 - done.float()) * self.gamma * next_Q).float()

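######################################################################
# For context (standard background, an addition to the original text): a
# plain DQN would both select and evaluate the next action with the target
# network,
#
# .. math::
#
#
#    {TD}_t^{DQN} = r + \gamma \max_{a'} Q_{target}(s', a')
#
# Double DQN instead selects :math:`a'` with :math:`Q_{online}` and evaluates
# it with :math:`Q_{target}` (as in ``td_target()`` above), which reduces the
# overestimation bias introduced by the max operator.
#
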
######################################################################
# Updating the model
# ~~~~~~~~~~~~~~~~~~~~~~
#
# As Mario samples inputs from his replay buffer, we compute :math:`TD_t`
# and :math:`TD_e`, take the loss between them (``SmoothL1Loss`` in the code
# below), and backpropagate it through :math:`Q_{online}` to update its
# parameters :math:`\theta_{online}` (:math:`\alpha` is the learning rate
# ``lr`` passed to the ``optimizer``)
#
# .. math::
#
#
#    \theta_{online} \leftarrow \theta_{online} + \alpha \nabla(TD_e - TD_t)
#
# :math:`\theta_{target}` does not update through backpropagation.
# Instead, we periodically copy :math:`\theta_{online}` to
# :math:`\theta_{target}`
#
# .. math::
#
#
#    \theta_{target} \leftarrow \theta_{online}
#
#


class Mario(Mario):
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim, action_dim, save_dir)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
        self.loss_fn = torch.nn.SmoothL1Loss()

    def update_Q_online(self, td_estimate, td_target):
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def sync_Q_target(self):
        self.net.target.load_state_dict(self.net.online.state_dict())

######################################################################
# Save checkpoint
# ~~~~~~~~~~~~~~~~~~
#


class Mario(Mario):
    def save(self):
        save_path = (
            self.save_dir / f"mario_net_{int(self.curr_step // self.save_every)}.chkpt"
        )
        torch.save(
            dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate),
            save_path,
        )
        print(f"MarioNet saved to {save_path} at step {self.curr_step}")

######################################################################
# Putting it all together
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
#


class Mario(Mario):
    def __init__(self, state_dim, action_dim, save_dir):
        super().__init__(state_dim, action_dim, save_dir)
        self.burnin = 1e4  # min. experiences before training
        self.learn_every = 3  # no. of experiences between updates to Q_online
        self.sync_every = 1e4  # no. of experiences between Q_target & Q_online sync

    def learn(self):
        if self.curr_step % self.sync_every == 0:
            self.sync_Q_target()

        if self.curr_step % self.save_every == 0:
            self.save()

        if self.curr_step < self.burnin:
            return None, None

        if self.curr_step % self.learn_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.recall()

        # Get TD Estimate
        td_est = self.td_estimate(state, action)

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done)

        # Backpropagate loss through Q_online
        loss = self.update_Q_online(td_est, td_tgt)

        return (td_est.mean().item(), loss)

######################################################################
# Logging
# --------------
#

import numpy as np
import time, datetime
import matplotlib.pyplot as plt


class MetricLogger:
    def __init__(self, save_dir):
        self.save_log = save_dir / "log"
        with open(self.save_log, "w") as f:
            f.write(
                f"{'Episode':>8}{'Step':>8}{'Epsilon':>10}{'MeanReward':>15}"
                f"{'MeanLength':>15}{'MeanLoss':>15}{'MeanQValue':>15}"
                f"{'TimeDelta':>15}{'Time':>20}\n"
            )
        self.ep_rewards_plot = save_dir / "reward_plot.jpg"
        self.ep_lengths_plot = save_dir / "length_plot.jpg"
        self.ep_avg_losses_plot = save_dir / "loss_plot.jpg"
        self.ep_avg_qs_plot = save_dir / "q_plot.jpg"

        # History metrics
        self.ep_rewards = []
        self.ep_lengths = []
        self.ep_avg_losses = []
        self.ep_avg_qs = []

        # Moving averages, added for every call to record()
        self.moving_avg_ep_rewards = []
        self.moving_avg_ep_lengths = []
        self.moving_avg_ep_avg_losses = []
        self.moving_avg_ep_avg_qs = []

        # Current episode metric
        self.init_episode()

        # Timing
        self.record_time = time.time()

    def log_step(self, reward, loss, q):
        self.curr_ep_reward += reward
        self.curr_ep_length += 1
        if loss:
            self.curr_ep_loss += loss
            self.curr_ep_q += q
            self.curr_ep_loss_length += 1

    def log_episode(self):
        "Mark end of episode"
        self.ep_rewards.append(self.curr_ep_reward)
        self.ep_lengths.append(self.curr_ep_length)
        if self.curr_ep_loss_length == 0:
            ep_avg_loss = 0
            ep_avg_q = 0
        else:
            ep_avg_loss = np.round(self.curr_ep_loss / self.curr_ep_loss_length, 5)
            ep_avg_q = np.round(self.curr_ep_q / self.curr_ep_loss_length, 5)
        self.ep_avg_losses.append(ep_avg_loss)
        self.ep_avg_qs.append(ep_avg_q)

        self.init_episode()

    def init_episode(self):
        self.curr_ep_reward = 0.0
        self.curr_ep_length = 0
        self.curr_ep_loss = 0.0
        self.curr_ep_q = 0.0
        self.curr_ep_loss_length = 0

    def record(self, episode, epsilon, step):
        mean_ep_reward = np.round(np.mean(self.ep_rewards[-100:]), 3)
        mean_ep_length = np.round(np.mean(self.ep_lengths[-100:]), 3)
        mean_ep_loss = np.round(np.mean(self.ep_avg_losses[-100:]), 3)
        mean_ep_q = np.round(np.mean(self.ep_avg_qs[-100:]), 3)
        self.moving_avg_ep_rewards.append(mean_ep_reward)
        self.moving_avg_ep_lengths.append(mean_ep_length)
        self.moving_avg_ep_avg_losses.append(mean_ep_loss)
        self.moving_avg_ep_avg_qs.append(mean_ep_q)

        last_record_time = self.record_time
        self.record_time = time.time()
        time_since_last_record = np.round(self.record_time - last_record_time, 3)

        print(
            f"Episode {episode} - "
            f"Step {step} - "
            f"Epsilon {epsilon} - "
            f"Mean Reward {mean_ep_reward} - "
            f"Mean Length {mean_ep_length} - "
            f"Mean Loss {mean_ep_loss} - "
            f"Mean Q Value {mean_ep_q} - "
            f"Time Delta {time_since_last_record} - "
            f"Time {datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')}"
        )

        with open(self.save_log, "a") as f:
            f.write(
                f"{episode:8d}{step:8d}{epsilon:10.3f}"
                f"{mean_ep_reward:15.3f}{mean_ep_length:15.3f}{mean_ep_loss:15.3f}{mean_ep_q:15.3f}"
                f"{time_since_last_record:15.3f}"
                f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n"
            )

        for metric in ["ep_lengths", "ep_avg_losses", "ep_avg_qs", "ep_rewards"]:
            plt.clf()
            plt.plot(getattr(self, f"moving_avg_{metric}"), label=f"moving_avg_{metric}")
            plt.legend()
            plt.savefig(getattr(self, f"{metric}_plot"))

######################################################################
# Let’s play!
# """""""""""""""
#
# In this example we run the training loop for 40 episodes, but for Mario to truly learn the ways of
# his world, we suggest running the loop for at least 40,000 episodes!
#
use_cuda = torch.cuda.is_available()
print(f"Using CUDA: {use_cuda}")
print()

save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)

mario = Mario(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir)

logger = MetricLogger(save_dir)

episodes = 40
for e in range(episodes):

    state = env.reset()

    # Play the game!
    while True:

        # Run agent on the state
        action = mario.act(state)

        # Agent performs action
        next_state, reward, done, trunc, info = env.step(action)

        # Remember
        mario.cache(state, next_state, action, reward, done)

        # Learn
        q, loss = mario.learn()

        # Logging
        logger.log_step(reward, loss, q)

        # Update state
        state = next_state

        # Check if end of game
        if done or info["flag_get"]:
            break

    logger.log_episode()

    if (e % 20 == 0) or (e == episodes - 1):
        logger.record(episode=e, epsilon=mario.exploration_rate, step=mario.curr_step)

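######################################################################
# A minimal sketch (an addition, not part of the original tutorial) of how a
# checkpoint written by ``Mario.save()`` could be reloaded afterwards, using
# the ``dict(model=..., exploration_rate=...)`` format defined above. With
# only 40 episodes no checkpoint may have been written yet, so the reload is
# guarded.

saved_checkpoints = sorted(save_dir.glob("mario_net_*.chkpt"))
if saved_checkpoints:
    checkpoint = torch.load(saved_checkpoints[-1], map_location=mario.device)
    mario.net.load_state_dict(checkpoint["model"])
    mario.exploration_rate = checkpoint["exploration_rate"]
    print(f"Reloaded {saved_checkpoints[-1]} (exploration_rate {mario.exploration_rate})")
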
######################################################################
# Conclusion
# """""""""""""""
#
# In this tutorial, we saw how we can use PyTorch to train a game-playing AI. You can use the same methods
# to train an AI to play any of the games in the `OpenAI Gym <https://gym.openai.com/>`__. We hope you
# enjoyed this tutorial! Feel free to reach out to us at
# `our GitHub <https://github.com/yuansongFeng/MadMario/>`__!