CoCalc -- timer_quick

GitHub Repository: pytorch/tutorials
Path: blob/main/recipes_source/recipes/timer_quick_start.py
¹³⁷² views
1
"""
2
Timer quick start
3
=================
4

5
In this tutorial, we're going to cover the primary APIs of
6
`torch.utils.benchmark.Timer`. The PyTorch Timer is based on the
7
`timeit.Timer <https://docs.python.org/3/library/timeit.html#timeit.Timer>`__
8
API, with several PyTorch specific modifications. Familiarity with the
9
builtin `Timer` class is not required for this tutorial, however we assume
10
that the reader is familiar with the fundamentals of performance work.
11

12
For a more comprehensive performance tuning tutorial, see
13
`PyTorch Benchmark <https://pytorch.org/tutorials/recipes/recipes/benchmark.html>`__.
14

15

16
**Contents:**
17
    1. `Defining a Timer <#defining-a-timer>`__
18
    2. `Wall time: Timer.blocked_autorange(...) <#wall-time-timer-blocked-autorange>`__
19
    3. `C++ snippets <#c-snippets>`__
20
    4. `Instruction counts: Timer.collect_callgrind(...) <#instruction-counts-timer-collect-callgrind>`__
21
    5. `Instruction counts: Delving deeper <#instruction-counts-delving-deeper>`__
22
    6. `A/B testing with Callgrind <#a-b-testing-with-callgrind>`__
23
    7. `Wrapping up <#wrapping-up>`__
24
    8. `Footnotes <#footnotes>`__
25
"""
26

27

28
###############################################################################
29
# 1. Defining a Timer
30
# ~~~~~~~~~~~~~~~~~~~
31
#
32
# A `Timer` serves as a task definition.
33
#
34

35
from torch.utils.benchmark import Timer
36

37
timer = Timer(
38
    # The computation which will be run in a loop and timed.
39
    stmt="x * y",
40

41
    # `setup` will be run before calling the measurement loop, and is used to
42
    # populate any state which is needed by `stmt`
43
    setup="""
44
        x = torch.ones((128,))
45
        y = torch.ones((128,))
46
    """,
47

48
    # Alternatively, ``globals`` can be used to pass variables from the outer scope.
49
    # 
50
    #    globals={
51
    #        "x": torch.ones((128,)),
52
    #        "y": torch.ones((128,)),
53
    #    },
54

55
    # Control the number of threads that PyTorch uses. (Default: 1)
56
    num_threads=1,
57
)
58

59
###############################################################################
60
# 2. Wall time: ``Timer.blocked_autorange(...)``
61
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
62
#
63
# This method will handle details such as picking a suitable number if repeats,
64
# fixing the number of threads, and providing a convenient representation of
65
# the results.
66
#
67

68
# Measurement objects store the results of multiple repeats, and provide
69
# various utility features.
70
from torch.utils.benchmark import Measurement
71

72
m: Measurement = timer.blocked_autorange(min_run_time=1)
73
print(m)
74

75
###############################################################################
76
# .. code-block:: none
77
#    :caption: **Snippet wall time.**
78
#
79
#         <torch.utils.benchmark.utils.common.Measurement object at 0x7f1929a38ed0>
80
#         x * y
81
#         setup:
82
#           x = torch.ones((128,))
83
#           y = torch.ones((128,))
84
#
85
#           Median: 2.34 us
86
#           IQR:    0.07 us (2.31 to 2.38)
87
#           424 measurements, 1000 runs per measurement, 1 thread
88
#
89

90
###############################################################################
91
# 3. C++ snippets
92
# ~~~~~~~~~~~~~~~
93
#
94

95
from torch.utils.benchmark import Language
96

97
cpp_timer = Timer(
98
    "x * y;",
99
    """
100
        auto x = torch::ones({128});
101
        auto y = torch::ones({128});
102
    """,
103
    language=Language.CPP,
104
)
105

106
print(cpp_timer.blocked_autorange(min_run_time=1))
107

108
###############################################################################
109
# .. code-block:: none
110
#    :caption: **C++ snippet wall time.**
111
#
112
#         <torch.utils.benchmark.utils.common.Measurement object at 0x7f192b019ed0>
113
#         x * y;
114
#         setup:
115
#           auto x = torch::ones({128});
116
#           auto y = torch::ones({128});
117
#
118
#           Median: 1.21 us
119
#           IQR:    0.03 us (1.20 to 1.23)
120
#           83 measurements, 10000 runs per measurement, 1 thread
121
#
122

123
###############################################################################
124
# Unsurprisingly, the C++ snippet is both faster and has lower variation.
125
#
126

127
###############################################################################
128
# 4. Instruction counts: ``Timer.collect_callgrind(...)``
129
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
130
#
131
# For deep dive investigations, ``Timer.collect_callgrind`` wraps
132
# `Callgrind <https://valgrind.org/docs/manual/cl-manual.html>`__ in order to
133
# collect instruction counts. These are useful as they offer fine grained and
134
# deterministic (or very low noise in the case of Python) insights into how a
135
# snippet is run.
136
#
137

138
from torch.utils.benchmark import CallgrindStats, FunctionCounts
139

140
stats: CallgrindStats = cpp_timer.collect_callgrind()
141
print(stats)
142

143
###############################################################################
144
# .. code-block:: none
145
#    :caption: **C++ Callgrind stats (summary)**
146
#
147
#         <torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.CallgrindStats object at 0x7f1929a35850>
148
#         x * y;
149
#         setup:
150
#           auto x = torch::ones({128});
151
#           auto y = torch::ones({128});
152
#
153
#                                 All          Noisy symbols removed
154
#             Instructions:       563600                     563600
155
#             Baseline:                0                          0
156
#         100 runs per measurement, 1 thread
157
#
158

159
###############################################################################
160
# 5. Instruction counts: Delving deeper
161
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
162
#
163
# The string representation of ``CallgrindStats`` is similar to that of
164
# Measurement. `Noisy symbols` are a Python concept (removing calls in the
165
# CPython interpreter which are known to be noisy).
166
#
167
# For more detailed analysis, however, we will want to look at specific calls.
168
# ``CallgrindStats.stats()`` returns a ``FunctionCounts`` object to make this easier.
169
# Conceptually, ``FunctionCounts`` can be thought of as a tuple of pairs with some
170
# utility methods, where each pair is `(number of instructions, file path and
171
# function name)`.
172
#
173
# A note on paths:
174
#   One generally doesn't care about absolute path. For instance, the full path
175
#   and function name for a multiply call is something like:
176
#
177
# .. code-block:: sh
178
#
179
#    /the/prefix/to/your/pytorch/install/dir/pytorch/build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const [/the/path/to/your/conda/install/miniconda3/envs/ab_ref/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so]
180
#
181
#   when in reality, all of the information that we're interested in can be
182
#   represented in:
183
#
184
# .. code-block:: sh
185
#
186
#    build/aten/src/ATen/core/TensorMethods.cpp:at::Tensor::mul(at::Tensor const&) const
187
#
188
#   ``CallgrindStats.as_standardized()`` makes a best effort to strip low signal
189
#   portions of the file path, as well as the shared object and is generally
190
#   recommended.
191
#
192

193
inclusive_stats = stats.as_standardized().stats(inclusive=False)
194
print(inclusive_stats[:10])
195

196
###############################################################################
197
# .. code-block:: none
198
#    :caption: **C++ Callgrind stats (detailed)**
199
#
200
#         torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.FunctionCounts object at 0x7f192a6dfd90>
201
#           47264  ???:_int_free
202
#           25963  ???:_int_malloc
203
#           19900  build/../aten/src/ATen/TensorIter ... (at::TensorIteratorConfig const&)
204
#           18000  ???:__tls_get_addr
205
#           13500  ???:malloc
206
#           11300  build/../c10/util/SmallVector.h:a ... (at::TensorIteratorConfig const&)
207
#           10345  ???:_int_memalign
208
#           10000  build/../aten/src/ATen/TensorIter ... (at::TensorIteratorConfig const&)
209
#            9200  ???:free
210
#            8000  build/../c10/util/SmallVector.h:a ... IteratorBase::get_strides() const
211
#
212
#         Total: 173472
213
#
214

215
###############################################################################
216
# That's still quite a lot to digest. Let's use the `FunctionCounts.transform`
217
# method to trim some of the function path, and discard the function called.
218
# When we do, the counts of any collisions (e.g. `foo.h:a()` and `foo.h:b()`
219
# will both map to `foo.h`) will be added together.
220
#
221

222
import os
223
import re
224

225
def group_by_file(fn_name: str):
226
    if fn_name.startswith("???"):
227
        fn_dir, fn_file = fn_name.split(":")[:2]
228
    else:
229
        fn_dir, fn_file = os.path.split(fn_name.split(":")[0])
230
        fn_dir = re.sub("^.*build/../", "", fn_dir)
231
        fn_dir = re.sub("^.*torch/", "torch/", fn_dir)
232

233
    return f"{fn_dir:<15} {fn_file}"
234

235
print(inclusive_stats.transform(group_by_file)[:10])
236

237
###############################################################################
238
# .. code-block:: none
239
#    :caption: **Callgrind stats (condensed)**
240
#
241
#         <torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.FunctionCounts object at 0x7f192995d750>
242
#           118200  aten/src/ATen   TensorIterator.cpp
243
#            65000  c10/util        SmallVector.h
244
#            47264  ???             _int_free
245
#            25963  ???             _int_malloc
246
#            20900  c10/util        intrusive_ptr.h
247
#            18000  ???             __tls_get_addr
248
#            15900  c10/core        TensorImpl.h
249
#            15100  c10/core        CPUAllocator.cpp
250
#            13500  ???             malloc
251
#            12500  c10/core        TensorImpl.cpp
252
#
253
#         Total: 352327
254
#
255

256
###############################################################################
257
# 6. A/B testing with ``Callgrind``
258
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
259
#
260
# One of the most useful features of instruction counts is they allow fine
261
# grained comparison of computation, which is critical when analyzing
262
# performance.
263
#
264
# To see this in action, lets compare our multiplication of two size 128
265
# Tensors with a {128} x {1} multiplication, which will broadcast the second
266
# Tensor:
267
#   result = {a0 * b0, a1 * b0, ..., a127 * b0}
268
#
269

270
broadcasting_stats = Timer(
271
    "x * y;",
272
    """
273
        auto x = torch::ones({128});
274
        auto y = torch::ones({1});
275
    """,
276
    language=Language.CPP,
277
).collect_callgrind().as_standardized().stats(inclusive=False)
278

279
###############################################################################
280
# Often we want to A/B test two different environments. (e.g. testing a PR, or
281
# experimenting with compile flags.) This is quite simple, as ``CallgrindStats``,
282
# ``FunctionCounts``, and Measurement are all pickleable. Simply save measurements
283
# from each environment, and load them in a single process for analysis.
284
#
285

286
import pickle
287

288
# Let's round trip `broadcasting_stats` just to show that we can.
289
broadcasting_stats = pickle.loads(pickle.dumps(broadcasting_stats))
290

291

292
# And now to diff the two tasks:
293
delta = broadcasting_stats - inclusive_stats
294

295
def extract_fn_name(fn: str):
296
    """Trim everything except the function name."""
297
    fn = ":".join(fn.split(":")[1:])
298
    return re.sub(r"\(.+\)", "(...)", fn)
299

300
# We use `.transform` to make the diff readable:
301
print(delta.transform(extract_fn_name))
302

303

304
###############################################################################
305
# .. code-block:: none
306
#    :caption: **Instruction count delta**
307
#
308
#         <torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.FunctionCounts object at 0x7f192995d750>
309
#             17600  at::TensorIteratorBase::compute_strides(...)
310
#             12700  at::TensorIteratorBase::allocate_or_resize_outputs()
311
#             10200  c10::SmallVectorImpl<long>::operator=(...)
312
#              7400  at::infer_size(...)
313
#              6200  at::TensorIteratorBase::invert_perm(...) const
314
#              6064  _int_free
315
#              5100  at::TensorIteratorBase::reorder_dimensions()
316
#              4300  malloc
317
#              4300  at::TensorIteratorBase::compatible_stride(...) const
318
#               ...
319
#               -28  _int_memalign
320
#              -100  c10::impl::check_tensor_options_and_extract_memory_format(...)
321
#              -300  __memcmp_avx2_movbe
322
#              -400  at::detail::empty_cpu(...)
323
#             -1100  at::TensorIteratorBase::numel() const
324
#             -1300  void at::native::(...)
325
#             -2400  c10::TensorImpl::is_contiguous(...) const
326
#             -6100  at::TensorIteratorBase::compute_fast_setup_type(...)
327
#            -22600  at::TensorIteratorBase::fast_set_up(...)
328
#
329
#         Total: 58091
330
#
331

332
###############################################################################
333
# So the broadcasting version takes an extra 580 instructions per call (recall
334
# that we're collecting 100 runs per sample), or about 10%. There are quite a
335
# few ``TensorIterator`` calls, so lets drill down to those. ``FunctionCounts.filter``
336
# makes this easy.
337
#
338

339
print(delta.transform(extract_fn_name).filter(lambda fn: "TensorIterator" in fn))
340

341
###############################################################################
342
# .. code-block:: none
343
#    :caption: **Instruction count delta (filter)**
344
#
345
#         <torch.utils.benchmark.utils.valgrind_wrapper.timer_interface.FunctionCounts object at 0x7f19299544d0>
346
#             17600  at::TensorIteratorBase::compute_strides(...)
347
#             12700  at::TensorIteratorBase::allocate_or_resize_outputs()
348
#              6200  at::TensorIteratorBase::invert_perm(...) const
349
#              5100  at::TensorIteratorBase::reorder_dimensions()
350
#              4300  at::TensorIteratorBase::compatible_stride(...) const
351
#              4000  at::TensorIteratorBase::compute_shape(...)
352
#              2300  at::TensorIteratorBase::coalesce_dimensions()
353
#              1600  at::TensorIteratorBase::build(...)
354
#             -1100  at::TensorIteratorBase::numel() const
355
#             -6100  at::TensorIteratorBase::compute_fast_setup_type(...)
356
#            -22600  at::TensorIteratorBase::fast_set_up(...)
357
#
358
#         Total: 24000
359
#
360

361
###############################################################################
362
# This makes plain what is going on: there is a fast path in ``TensorIterator``
363
# setup, but in the {128} x {1} case we miss it and have to do a more general
364
# analysis which is more expensive. The most prominent call omitted by the
365
# filter is `c10::SmallVectorImpl<long>::operator=(...)`, which is also part
366
# of the more general setup.
367
#
368

369
###############################################################################
370
# 7. Wrapping up
371
# ~~~~~~~~~~~~~~
372
#
373
# In summary, use `Timer.blocked_autorange` to collect wall times. If timing
374
# variation is too high, increase `min_run_time`, or move to C++ snippets if
375
# convenient.
376
#
377
# For fine grained analysis, use `Timer.collect_callgrind` to measure
378
# instruction counts and `FunctionCounts.(__add__ / __sub__ / transform / filter)`
379
# to slice-and-dice them.
380
#
381

382
###############################################################################
383
# 8. Footnotes
384
# ~~~~~~~~~~~~
385
#
386
#   - Implied ``import torch``
387
#       If `globals` does not contain "torch", Timer will automatically
388
#       populate it. This means that ``Timer("torch.empty(())")`` will work.
389
#       (Though other imports should be placed in `setup`,
390
#       e.g. ``Timer("np.zeros(())", "import numpy as np")``)
391
#
392
#   - ``REL_WITH_DEB_INFO``
393
#       In order to provide full information about the PyTorch internals which
394
#       are executed, ``Callgrind`` needs access to C++ debug symbols. This is
395
#       accomplished by setting ``REL_WITH_DEB_INFO=1`` when building PyTorch.
396
#       Otherwise function calls will be opaque. (The resultant ``CallgrindStats``
397
#       will warn if debug symbols are missing.)
398

399
Product

Resources

Company