# -*- coding: utf-8 -*-
"""
Hyperparameter tuning with Ray Tune
===================================

Hyperparameter tuning can make the difference between an average model and a highly
accurate one. Often simple things like choosing a different learning rate or changing
a network layer size can have a dramatic impact on your model performance.

Fortunately, there are tools that help with finding the best combination of parameters.
`Ray Tune <https://docs.ray.io/en/latest/tune.html>`_ is an industry standard tool for
distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search
algorithms, integrates with various analysis libraries, and natively
supports distributed training through `Ray's distributed machine learning engine
<https://ray.io/>`_.

In this tutorial, we will show you how to integrate Ray Tune into your PyTorch
training workflow. We will extend `this tutorial from the PyTorch documentation
<https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html>`_ for training
a CIFAR10 image classifier.

As you will see, we only need to make some slight modifications. In particular, we
need to

1. wrap data loading and training in functions,
2. make some network parameters configurable,
3. add checkpointing (optional),
4. and define the search space for the model tuning.

|

To run this tutorial, please make sure the following packages are
installed:

- ``ray[tune]``: Distributed hyperparameter tuning library
- ``torchvision``: For the dataset and data transforms

Setup / Imports
---------------
Let's start with the imports:
"""
from functools import partial
import os
import tempfile
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
# sphinx_gallery_start_ignore
# Fixes ``AttributeError: '_LoggingTee' object has no attribute 'fileno'``.
# This is only needed to run with sphinx-build.
import sys
if not hasattr(sys.stdout, "encoding"):
    sys.stdout.encoding = "latin1"
    sys.stdout.fileno = lambda: 0
# sphinx_gallery_end_ignore
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle

######################################################################
# Most of the imports are needed for building the PyTorch model. Only the last
# imports are for Ray Tune.
#
# Data loaders
# ------------
# We wrap the data loaders in their own function and pass a global data directory.
# This way we can share a data directory between different trials.


def load_data(data_dir="./data"):
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform
    )

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform
    )

    return trainset, testset


######################################################################
# Configurable neural network
# ---------------------------
# We can only tune those parameters that are configurable.
# In this example, we can specify
# the layer sizes of the fully connected layers:


class Net(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


######################################################################
# The train function
# ------------------
# Now it gets interesting, because we introduce some changes to the example `from the PyTorch
# documentation <https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html>`_.
#
# We wrap the training script in a function ``train_cifar(config, data_dir=None)``.
# The ``config`` parameter will receive the hyperparameters we would like to
# train with. The ``data_dir`` specifies the directory where we load and store the data,
# so that multiple runs can share the same data source.
# We also load the model and optimizer state at the start of the run, if a checkpoint
# is provided. Further down in this tutorial you will find information on how
# to save the checkpoint and what it is used for.
#
# .. code-block:: python
#
#     net = Net(config["l1"], config["l2"])
#
#     checkpoint = get_checkpoint()
#     if checkpoint:
#         with checkpoint.as_directory() as checkpoint_dir:
#             data_path = Path(checkpoint_dir) / "data.pkl"
#             with open(data_path, "rb") as fp:
#                 checkpoint_state = pickle.load(fp)
#             start_epoch = checkpoint_state["epoch"]
#             net.load_state_dict(checkpoint_state["net_state_dict"])
#             optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
#     else:
#         start_epoch = 0
#
# The learning rate of the optimizer is made configurable, too:
#
# .. code-block:: python
#
#     optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)
#
# We also split the training data into training and validation subsets. We thus train on
# 80% of the data and calculate the validation loss on the remaining 20%. The batch sizes
# with which we iterate through the training and validation sets are configurable as well.
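#
# For reference, the corresponding split inside ``train_cifar`` looks like this
# (the same code appears in the full training function further below):
#
# .. code-block:: python
#
#     test_abs = int(len(trainset) * 0.8)
#     train_subset, val_subset = random_split(
#         trainset, [test_abs, len(trainset) - test_abs]
#     )
#
#     trainloader = torch.utils.data.DataLoader(
#         train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
#     )
#     valloader = torch.utils.data.DataLoader(
#         val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
#     )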
#
# Adding (multi) GPU support with DataParallel
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Image classification benefits greatly from GPUs. Luckily, we can continue to use
# PyTorch's abstractions in Ray Tune. Thus, we can wrap our model in ``nn.DataParallel``
# to support data parallel training on multiple GPUs:
#
# .. code-block:: python
#
#     device = "cpu"
#     if torch.cuda.is_available():
#         device = "cuda:0"
#         if torch.cuda.device_count() > 1:
#             net = nn.DataParallel(net)
#     net.to(device)
#
# By using a ``device`` variable we make sure that training also works when we have
# no GPUs available. PyTorch requires us to send our data to the GPU memory explicitly,
# like this:
#
# .. code-block:: python
#
#     for i, data in enumerate(trainloader, 0):
#         inputs, labels = data
#         inputs, labels = inputs.to(device), labels.to(device)
#
# The code now supports training on CPUs, on a single GPU, and on multiple GPUs. Notably, Ray
# also supports `fractional GPUs <https://docs.ray.io/en/master/using-ray-with-gpus.html#fractional-gpus>`_,
# so we can share GPUs among trials, as long as the model still fits in GPU memory. We'll come back
# to that later.
#
# Communicating with Ray Tune
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The most interesting part is the communication with Ray Tune:
#
# .. code-block:: python
#
#     checkpoint_data = {
#         "epoch": epoch,
#         "net_state_dict": net.state_dict(),
#         "optimizer_state_dict": optimizer.state_dict(),
#     }
#     with tempfile.TemporaryDirectory() as checkpoint_dir:
#         data_path = Path(checkpoint_dir) / "data.pkl"
#         with open(data_path, "wb") as fp:
#             pickle.dump(checkpoint_data, fp)
#
#         checkpoint = Checkpoint.from_directory(checkpoint_dir)
#         train.report(
#             {"loss": val_loss / val_steps, "accuracy": correct / total},
#             checkpoint=checkpoint,
#         )
#
# Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically,
# we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics
# to decide which hyperparameter configuration leads to the best results. These metrics
# can also be used to stop badly performing trials early in order to avoid wasting
# resources on those trials.
#
# Checkpoint saving is optional. However, it is necessary if we want to use advanced
# schedulers like
# `Population Based Training <https://docs.ray.io/en/latest/tune/examples/pbt_guide.html>`_.
# Also, by saving the checkpoint we can later load the trained models and validate them
# on a test set. Lastly, saving checkpoints is useful for fault tolerance, and it allows
# us to interrupt training and resume it later.
#
# Full training function
# ~~~~~~~~~~~~~~~~~~~~~~
#
# The full code example looks like this:


def train_cifar(config, data_dir=None):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    checkpoint = get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / "data.pkl"
            with open(data_path, "rb") as fp:
                checkpoint_state = pickle.load(fp)
            start_epoch = checkpoint_state["epoch"]
            net.load_state_dict(checkpoint_state["net_state_dict"])
            optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs]
    )

    trainloader = torch.utils.data.DataLoader(
        train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )
    valloader = torch.utils.data.DataLoader(
        val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8
    )

    for epoch in range(start_epoch, 10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(
                    "[%d, %5d] loss: %.3f"
                    % (epoch + 1, i + 1, running_loss / epoch_steps)
                )
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        checkpoint_data = {
            "epoch": epoch,
            "net_state_dict": net.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / "data.pkl"
            with open(data_path, "wb") as fp:
                pickle.dump(checkpoint_data, fp)

            checkpoint = Checkpoint.from_directory(checkpoint_dir)
            train.report(
                {"loss": val_loss / val_steps, "accuracy": correct / total},
                checkpoint=checkpoint,
            )

    print("Finished Training")


######################################################################
# As you can see, most of the code is adapted directly from the original example.
#
# Test set accuracy
# -----------------
# Commonly, the performance of a machine learning model is tested on a hold-out test
# set with data that has not been used for training the model. We also wrap this in a
# function:


def test_accuracy(net, device="cpu"):
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2
    )

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total


######################################################################
# The function also expects a ``device`` parameter, so we can do the
# test set validation on a GPU.
#
# Configuring the search space
# ----------------------------
# Lastly, we need to define Ray Tune's search space. Here is an example:
#
# .. code-block:: python
#
#     config = {
#         "l1": tune.choice([2 ** i for i in range(9)]),
#         "l2": tune.choice([2 ** i for i in range(9)]),
#         "lr": tune.loguniform(1e-4, 1e-1),
#         "batch_size": tune.choice([2, 4, 8, 16])
#     }
#
# ``tune.choice()`` accepts a list of values that are uniformly sampled from.
# In this example, the ``l1`` and ``l2`` parameters
# should be powers of 2 between 1 and 256, so 1, 2, 4, 8, 16, 32, 64, 128, or 256.
# The ``lr`` (learning rate) should be sampled log-uniformly between 0.0001 and 0.1. Lastly,
# the batch size is a choice between 2, 4, 8, and 16.
#
# For each trial, Ray Tune will now randomly sample a combination of parameters from these
# search spaces. It will then train a number of models in parallel and find the best
# performing one among these. We also use the ``ASHAScheduler``, which terminates badly
# performing trials early.
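#
# For reference, the scheduler is created like this in the ``main`` function further
# below. ``max_t`` caps the number of training iterations (here: epochs) per trial,
# ``grace_period`` is the minimum number of iterations a trial runs before it can be
# stopped, and ``reduction_factor`` controls how aggressively trials are culled at
# each halving step:
#
# .. code-block:: python
#
#     scheduler = ASHAScheduler(
#         metric="loss",
#         mode="min",
#         max_t=max_num_epochs,
#         grace_period=1,
#         reduction_factor=2,
#     )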
#
# We wrap the ``train_cifar`` function with ``functools.partial`` to set the constant
# ``data_dir`` parameter. We can also tell Ray Tune what resources should be
# available for each trial:
#
# .. code-block:: python
#
#     gpus_per_trial = 2
#     # ...
#     result = tune.run(
#         partial(train_cifar, data_dir=data_dir),
#         resources_per_trial={"cpu": 8, "gpu": gpus_per_trial},
#         config=config,
#         num_samples=num_samples,
#         scheduler=scheduler,
#         checkpoint_at_end=True)
#
# You can specify the number of CPUs, which are then available, for example, to
# increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. The selected
# number of GPUs is made visible to PyTorch in each trial. Trials do not have access to
# GPUs that haven't been requested for them, so you don't have to worry about two trials
# using the same set of resources.
#
# Here we can also specify fractional GPUs, so something like ``gpus_per_trial=0.5`` is
# completely valid. The trials will then share GPUs among each other.
# You just have to make sure that the models still fit in the GPU memory.
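#
# As a small sketch (not part of the code below), such a fractional-GPU setup would
# only change the ``resources_per_trial`` argument:
#
# .. code-block:: python
#
#     result = tune.run(
#         partial(train_cifar, data_dir=data_dir),
#         resources_per_trial={"cpu": 2, "gpu": 0.5},  # two trials share one GPU
#         config=config,
#         num_samples=num_samples,
#         scheduler=scheduler,
#     )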
#
# After training the models, we will find the best performing one and load the trained
# network from the checkpoint file. We then obtain the test set accuracy and print
# the results.
#
# The full main function looks like this:


def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    config = {
        "l1": tune.choice([2**i for i in range(9)]),
        "l2": tune.choice([2**i for i in range(9)]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16]),
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="accuracy", mode="max")
    with best_checkpoint.as_directory() as checkpoint_dir:
        data_path = Path(checkpoint_dir) / "data.pkl"
        with open(data_path, "rb") as fp:
            best_checkpoint_data = pickle.load(fp)

        best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"])
        test_acc = test_accuracy(best_trained_model, device)
        print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)


######################################################################
# If you run the code, an example output could look like this:
#
# .. code-block:: sh
#
#     Number of trials: 10/10 (10 TERMINATED)
#     +-----+--------------+------+------+-------------+--------+---------+------------+
#     | ... |   batch_size |   l1 |   l2 |          lr |   iter |    loss |   accuracy |
#     |-----+--------------+------+------+-------------+--------+---------+------------|
#     | ... |            2 |    1 |  256 | 0.000668163 |      1 | 2.31479 |     0.0977 |
#     | ... |            4 |   64 |    8 |   0.0331514 |      1 | 2.31605 |     0.0983 |
#     | ... |            4 |    2 |    1 | 0.000150295 |      1 | 2.30755 |     0.1023 |
#     | ... |           16 |   32 |   32 |   0.0128248 |     10 | 1.66912 |     0.4391 |
#     | ... |            4 |    8 |  128 |  0.00464561 |      2 |  1.7316 |     0.3463 |
#     | ... |            8 |  256 |    8 |  0.00031556 |      1 | 2.19409 |     0.1736 |
#     | ... |            4 |   16 |  256 |  0.00574329 |      2 | 1.85679 |     0.3368 |
#     | ... |            8 |    2 |    2 |  0.00325652 |      1 | 2.30272 |     0.0984 |
#     | ... |            2 |    2 |    2 | 0.000342987 |      2 | 1.76044 |      0.292 |
#     | ... |            4 |   64 |   32 |    0.003734 |      8 | 1.53101 |     0.4761 |
#     +-----+--------------+------+------+-------------+--------+---------+------------+
#
#     Best trial config: {'l1': 64, 'l2': 32, 'lr': 0.0037339984519545164, 'batch_size': 4}
#     Best trial final validation loss: 1.5310075663924216
#     Best trial final validation accuracy: 0.4761
#     Best trial test set accuracy: 0.4737
#
# Most trials have been stopped early in order to avoid wasting resources.
# The best performing trial achieved a validation accuracy of about 47%, which could
# be confirmed on the test set.
#
# So that's it! You can now tune the parameters of your PyTorch models.
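#
# If you want to tune more hyperparameters, the same pattern applies. As a possible
# next step (a sketch, not part of the tutorial code above), you could sample the SGD
# momentum as well instead of hard-coding it to ``0.9``; ``train_cifar`` would then
# read ``config["momentum"]`` when creating the optimizer:
#
# .. code-block:: python
#
#     config = {
#         "l1": tune.choice([2**i for i in range(9)]),
#         "l2": tune.choice([2**i for i in range(9)]),
#         "lr": tune.loguniform(1e-4, 1e-1),
#         "momentum": tune.uniform(0.1, 0.9),  # additional hyperparameter to tune
#         "batch_size": tune.choice([2, 4, 8, 16]),
#     }
#
#     # ... and inside train_cifar:
#     optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=config["momentum"])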