GitHub Repository: trixi-framework/Trixi.jl
Path: blob/main/src/auxiliary/math.jl
# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
# Since these FMAs can increase the performance of many numerical algorithms,
# we need to opt in explicitly.
# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
@muladd begin
#! format: noindent

const TRIXI_UUID = UUID("a7f1ee26-1774-49b1-8366-f1abc58fbfcb")

"""
    Trixi.set_threading_backend!(backend::Symbol = :polyester; force = true)

Select the backend used for multithreaded loops inside Trixi.jl.
The selected backend affects the behavior of Trixi.jl's [`@threaded`](@ref) macro, which is used
throughout the codebase for parallel loops. By default, Polyester.jl is enabled for
optimal performance, but switching backends can be useful for comparisons or debugging.

# Available backends
- `:polyester`: Use [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl) (the default)
- `:static`: Use Julia's built-in static thread scheduling via `Threads.@threads :static`
- `:serial`: Disable threading and execute loops serially
- `:kernelabstractions`: Preferentially use the [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)
  kernels written in Trixi.jl, falling back to `:static` execution otherwise
"""
function set_threading_backend!(backend::Symbol = :polyester; force = true)
    valid_backends = (:polyester, :static, :serial, :kernelabstractions)
    if !(backend in valid_backends)
        throw(ArgumentError("Invalid threading backend: $(backend). Valid options are: $(join(valid_backends, ", "))"))
    end
    set_preferences!(TRIXI_UUID, "backend" => string(backend), force = force)
    @info "Please restart Julia and reload Trixi.jl for the `backend` change to take effect"
end
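
# A minimal usage sketch (hypothetical REPL session; the preference is stored via
# Preferences.jl and only takes effect after restarting Julia):
#
#   julia> Trixi.set_threading_backend!(:serial)
#   [ Info: Please restart Julia and reload Trixi.jl for the `backend` change to take effect
#
# Passing a symbol outside the list of valid backends throws an `ArgumentError`.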

"""
    Trixi.set_loop_vectorization!(toggle::Bool; force = true)

Toggle the usage of [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
By default, LoopVectorization.jl is enabled, but it can
be useful for performance comparisons to switch to the Julia core backend.

This does not fully disable LoopVectorization.jl,
but only its internal use as part of Trixi.jl.
"""
function set_loop_vectorization!(toggle::Bool; force = true)
    set_preferences!(TRIXI_UUID, "loop_vectorization" => toggle, force = force)
    @info "Please restart Julia and reload Trixi.jl for the `loop_vectorization` change to take effect"
end

"""
    Trixi.set_sqrt_type!(type; force = true)

Set the `type` of the square root function to be used in Trixi.jl.
The default is `"sqrt_Trixi_NaN"`, which returns `NaN` for negative arguments
instead of throwing an error.
Alternatively, you can set `type` to `"sqrt_Base"` to use the Julia built-in `sqrt` function,
which provides a stack trace of the error; this might come in handy when debugging code.
"""
function set_sqrt_type!(type; force = true)
    @assert type == "sqrt_Trixi_NaN"||type == "sqrt_Base" "Only allowed `sqrt` function types are `\"sqrt_Trixi_NaN\"` and `\"sqrt_Base\"`"
    set_preferences!(TRIXI_UUID, "sqrt" => type, force = force)
    @info "Please restart Julia and reload Trixi.jl for the `sqrt` computation change to take effect"
end

@static if _PREFERENCE_SQRT == "sqrt_Trixi_NaN"
    """
        Trixi.sqrt(x::Real)

    Custom square root function which returns `NaN` for negative arguments instead of throwing an error.
    This is required to ensure [correct results for multithreaded computations](https://github.com/trixi-framework/Trixi.jl/issues/1766)
    when using the [`Polyester` package](https://github.com/JuliaSIMD/Polyester.jl),
    i.e., using the `@batch` macro instead of the Julia built-in `@threads` macro, see [`@threaded`](@ref).

    We dispatch this function for `Float64, Float32, Float16` to the LLVM intrinsics
    `llvm.sqrt.f64`, `llvm.sqrt.f32`, `llvm.sqrt.f16`, since for these types the LLVM intrinsics can be used out of the box,
    i.e., they return `NaN` for negative arguments.
    In principle, one could also use the `sqrt_llvm` call, but for transparency and consistency with [`log`](@ref) we
    spell out the datatype-dependent functions here.
    For other types, such as integers or dual numbers required for algorithmic differentiation, we
    fall back to the Julia built-in `sqrt` function after a check for negative arguments.
    Since these cases are not performance critical, the check for negativity does not hurt here
    and can (as of now) even be optimized away by the compiler due to the implementation of `sqrt` in Julia.

    When debugging code, it might be useful to change the implementation of this function to redirect to
    the Julia built-in `sqrt` function, as this reports the exact place in code where the domain is violated
    in the stacktrace.

    See also [`Trixi.set_sqrt_type!`](@ref).
    """
    @inline sqrt(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.sqrt(x)

    # For `sqrt` we could use the `sqrt_llvm` call, ...
    #@inline sqrt(x::Union{Float64, Float32, Float16}) = Base.sqrt_llvm(x)

    # ... but for transparency and consistency we use the direct LLVM calls here.
    @inline sqrt(x::Float64) = ccall("llvm.sqrt.f64", llvmcall, Float64, (Float64,), x)
    @inline sqrt(x::Float32) = ccall("llvm.sqrt.f32", llvmcall, Float32, (Float32,), x)
    @inline sqrt(x::Float16) = ccall("llvm.sqrt.f16", llvmcall, Float16, (Float16,), x)
end

"""
    Trixi.set_log_type!(type; force = true)

Set the `type` of the (natural) `log` function to be used in Trixi.jl.
The default is `"log_Trixi_NaN"`, which returns `NaN` for negative arguments
instead of throwing an error.
Alternatively, you can set `type` to `"log_Base"` to use the Julia built-in `log` function,
which provides a stack trace of the error; this might come in handy when debugging code.
"""
function set_log_type!(type; force = true)
    @assert type == "log_Trixi_NaN"||type == "log_Base" "Only allowed log function types are `\"log_Trixi_NaN\"` and `\"log_Base\"`."
    set_preferences!(TRIXI_UUID, "log" => type, force = force)
    @info "Please restart Julia and reload Trixi.jl for the `log` computation change to take effect"
end

@static if _PREFERENCE_LOG == "log_Trixi_NaN"
    """
        Trixi.log(x::Real)

    Custom natural logarithm function which returns `NaN` for negative arguments instead of throwing an error.
    This is required to ensure [correct results for multithreaded computations](https://github.com/trixi-framework/Trixi.jl/issues/1766)
    when using the [`Polyester` package](https://github.com/JuliaSIMD/Polyester.jl),
    i.e., using the `@batch` macro instead of the Julia built-in `@threads` macro, see [`@threaded`](@ref).

    We dispatch this function for `Float64, Float32, Float16` to the respective LLVM intrinsics
    `llvm.log.f64`, `llvm.log.f32`, `llvm.log.f16`, since for these types the LLVM intrinsics can be used out of the box, i.e.,
    they return `NaN` for negative arguments.
    For other types, such as integers or dual numbers required for algorithmic differentiation, we
    fall back to the Julia built-in `log` function after a check for negative arguments.
    Since these cases are not performance critical, the check for negativity does not hurt here.

    When debugging code, it might be useful to change the implementation of this function to redirect to
    the Julia built-in `log` function, as this reports the exact place in code where the domain is violated
    in the stacktrace.

    See also [`Trixi.set_log_type!`](@ref).
    """
    @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)

    @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
    @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
    @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
end
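
# A minimal sanity-check sketch (hypothetical REPL session, assuming the default
# preferences "sqrt_Trixi_NaN" and "log_Trixi_NaN" are active, so the custom
# definitions above are compiled in): the custom functions return `NaN` for negative
# floating-point arguments, whereas `Base.sqrt`/`Base.log` would throw a `DomainError`.
#
#   julia> Trixi.sqrt(-1.0), Trixi.log(-1.0)
#   (NaN, NaN)
#
#   julia> Trixi.sqrt(4.0), Trixi.log(1.0)
#   (2.0, 0.0)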

"""
    Trixi.ln_mean(x::Real, y::Real)

Compute the logarithmic mean

    ln_mean(x, y) = (y - x) / (log(y) - log(x)) = (y - x) / log(y / x)

Problem: The formula above has a removable singularity at `x == y`. Thus,
some care must be taken to implement it correctly without problems or loss
of accuracy when `x ≈ y`. Here, we use the approach proposed by
Ismail and Roe (2009).
Set ξ = y / x. Then, we have

    (y - x) / log(y / x) = (x + y) / log(ξ) * (ξ - 1) / (ξ + 1)

Set f = (ξ - 1) / (ξ + 1) = (y - x) / (x + y). Then, we use the expansion

    log(ξ) = 2 * f * (1 + f^2 / 3 + f^4 / 5 + f^6 / 7) + O(f^9)

Inserting the first few terms of this expansion yields

    (y - x) / log(ξ) ≈ (x + y) * f / (2 * f * (1 + f^2 / 3 + f^4 / 5 + f^6 / 7))
                     = (x + y) / (2 + 2/3 * f^2 + 2/5 * f^4 + 2/7 * f^6)

Since divisions are usually more expensive on modern hardware than
multiplications (Agner Fog), we try to avoid computing two divisions. Thus,
we use

    f^2 = (y - x)^2 / (x + y)^2
        = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y)

Given ε = 1.0e-4, we use the following algorithm.

    if f^2 < ε
        # use the expansion above
    else
        # use the direct formula (y - x) / log(y / x)
    end

# References
- Ismail, Roe (2009).
  Affordable, entropy-consistent Euler flux functions II: Entropy production at shocks.
  [DOI: 10.1016/j.jcp.2009.04.021](https://doi.org/10.1016/j.jcp.2009.04.021)
- Agner Fog.
  Lists of instruction latencies, throughputs and micro-operation breakdowns
  for Intel, AMD, and VIA CPUs.
  [https://www.agner.org/optimize/instruction_tables.pdf](https://www.agner.org/optimize/instruction_tables.pdf)
"""
@inline ln_mean(x::Real, y::Real) = ln_mean(promote(x, y)...)

@inline function ln_mean(x::RealT, y::RealT) where {RealT <: Real}
    epsilon_f2 = convert(RealT, 1.0e-4)
    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
    if f2 < epsilon_f2
        return (x + y) / @evalpoly(f2,
                                   2,
                                   convert(RealT, 2 / 3),
                                   convert(RealT, 2 / 5),
                                   convert(RealT, 2 / 7))
    else
        return (y - x) / log(y / x)
    end
end
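
# A small illustrative check (hypothetical REPL session, not part of the library):
# for well-separated arguments the direct formula branch is taken, while for x ≈ y
# the expansion avoids the 0/0 singularity.
#
#   julia> Trixi.ln_mean(1.0, 2.0) ≈ (2.0 - 1.0) / log(2.0)
#   true
#
#   julia> Trixi.ln_mean(1.0, 1.0)  # removable singularity handled by the expansion
#   1.0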

"""
    Trixi.inv_ln_mean(x::Real, y::Real)

Compute the inverse `1 / ln_mean(x, y)` of the logarithmic mean
[`ln_mean`](@ref).

This function may be used to increase performance where the inverse of the
logarithmic mean is needed, by replacing a (slow) division by a (fast)
multiplication.
"""
@inline inv_ln_mean(x::Real, y::Real) = inv_ln_mean(promote(x, y)...)

@inline function inv_ln_mean(x::RealT, y::RealT) where {RealT <: Real}
    epsilon_f2 = convert(RealT, 1.0e-4)
    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
    if f2 < epsilon_f2
        return @evalpoly(f2,
                         2,
                         convert(RealT, 2 / 3),
                         convert(RealT, 2 / 5),
                         convert(RealT, 2 / 7)) / (x + y)
    else
        return log(y / x) / (y - x)
    end
end
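
# Illustrative relation (hypothetical REPL session): `inv_ln_mean` is the reciprocal
# of `ln_mean`, so flux implementations can trade a division for a multiplication.
#
#   julia> x, y = 0.9, 1.1;
#
#   julia> Trixi.inv_ln_mean(x, y) * Trixi.ln_mean(x, y) ≈ 1.0
#   true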

# `Base.max` and `Base.min` perform additional checks for signed zeros and `NaN`s
# which are not present in comparable functions in Fortran/C++. For example,
# ```julia
# julia> @code_native debuginfo=:none syntax=:intel max(1.0, 2.0)
#     .text
#     vmovq rcx, xmm1
#     vmovq rax, xmm0
#     vcmpordsd xmm2, xmm0, xmm0
#     vblendvpd xmm2, xmm0, xmm1, xmm2
#     vcmpordsd xmm3, xmm1, xmm1
#     vblendvpd xmm3, xmm1, xmm0, xmm3
#     vmovapd xmm4, xmm2
#     test rcx, rcx
#     jns L45
#     vmovapd xmm4, xmm3
# L45:
#     test rax, rax
#     js L54
#     vmovapd xmm4, xmm3
# L54:
#     vcmpltsd xmm0, xmm0, xmm1
#     vblendvpd xmm0, xmm4, xmm2, xmm0
#     ret
#     nop word ptr cs:[rax + rax]
#
# julia> @code_native debuginfo=:none syntax=:intel min(1.0, 2.0)
#     .text
#     vmovq rcx, xmm1
#     vmovq rax, xmm0
#     vcmpordsd xmm2, xmm0, xmm0
#     vblendvpd xmm2, xmm0, xmm1, xmm2
#     vcmpordsd xmm3, xmm1, xmm1
#     vblendvpd xmm3, xmm1, xmm0, xmm3
#     vmovapd xmm4, xmm2
#     test rcx, rcx
#     jns L58
#     test rax, rax
#     js L67
# L46:
#     vcmpltsd xmm0, xmm1, xmm0
#     vblendvpd xmm0, xmm4, xmm2, xmm0
#     ret
# L58:
#     vmovapd xmm4, xmm3
#     test rax, rax
#     jns L46
# L67:
#     vmovapd xmm4, xmm3
#     vcmpltsd xmm0, xmm1, xmm0
#     vblendvpd xmm0, xmm4, xmm2, xmm0
#     ret
#     nop word ptr cs:[rax + rax]
#     nop dword ptr [rax]
# ```
# In contrast, we get the much simpler and faster version
# ```julia
# julia> @code_native debuginfo=:none syntax=:intel Base.FastMath.max_fast(1.0, 2.0)
#     .text
#     vmaxsd xmm0, xmm1, xmm0
#     ret
#     nop word ptr cs:[rax + rax]
#
# julia> @code_native debuginfo=:none syntax=:intel Base.FastMath.min_fast(1.0, 2.0)
#     .text
#     vminsd xmm0, xmm0, xmm1
#     ret
#     nop word ptr cs:[rax + rax]
# ```
# when using `@fastmath`, which we also get from
# [Fortran](https://godbolt.org/z/Yrsa1js7P)
# or [C++](https://godbolt.org/z/674G7Pccv).
#
# Note however that such a custom reimplementation can cause incompatibilities with other
# packages. Currently we are affected by an issue with MPI.jl on ARM, see
# https://github.com/trixi-framework/Trixi.jl/issues/1922
# The workaround is to resort to Base.min / Base.max when using MPI reductions.
"""
    Trixi.max(x, y, ...)

Return the maximum of the arguments. See also the `maximum` function to take
the maximum element from a collection.

This version in Trixi.jl is semantically equivalent to `Base.max` but may
be implemented differently. In particular, it may avoid potentially expensive
checks necessary in the presence of `NaN`s (or signed zeros).

# Examples

```jldoctest
julia> max(2, 5, 1)
5
```
"""
@inline max(args...) = @fastmath max(args...)

"""
    Trixi.min(x, y, ...)

Return the minimum of the arguments. See also the `minimum` function to take
the minimum element from a collection.

This version in Trixi.jl is semantically equivalent to `Base.min` but may
be implemented differently. In particular, it may avoid potentially expensive
checks necessary in the presence of `NaN`s (or signed zeros).

# Examples

```jldoctest
julia> min(2, 5, 1)
1
```
"""
@inline min(args...) = @fastmath min(args...)
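
# Illustrative caveat (hypothetical REPL session): for ordinary finite arguments the
# `@fastmath` versions agree with `Base.max`/`Base.min`, but the result for `NaN`
# or signed-zero arguments is not specified, so they should not be relied upon there.
#
#   julia> Trixi.max(2, 5, 1) == Base.max(2, 5, 1)
#   true
#
#   julia> Base.max(NaN, 1.0)   # Base propagates NaN; Trixi.max may not
#   NaN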

"""
    Trixi.positive_part(x)

Return `x` if `x` is positive, else zero. In other words, return
`(x + abs(x)) / 2` for real numbers `x`.
"""
@inline function positive_part(x)
    return max(x, zero(x))
end

"""
    Trixi.negative_part(x)

Return `x` if `x` is negative, else zero. In other words, return
`(x - abs(x)) / 2` for real numbers `x`.
"""
@inline function negative_part(x)
    return min(x, zero(x))
end
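
# Quick illustration (hypothetical REPL session) of the positive/negative part
# decomposition, which satisfies positive_part(x) + negative_part(x) == x:
#
#   julia> Trixi.positive_part(-3.0), Trixi.negative_part(-3.0)
#   (0.0, -3.0)
#
#   julia> Trixi.positive_part(2.0), Trixi.negative_part(2.0)
#   (2.0, 0.0)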

"""
    Trixi.stolarsky_mean(x::Real, y::Real, gamma::Real)

Compute an instance of a weighted Stolarsky mean of the form

    stolarsky_mean(x, y, gamma) = (gamma - 1)/gamma * (y^gamma - x^gamma) / (y^(gamma-1) - x^(gamma-1))

where `gamma > 1`.

Problem: The formula above has a removable singularity at `x == y`. Thus,
some care must be taken to implement it correctly without problems or loss
of accuracy when `x ≈ y`. Here, we use the approach proposed by
Winters et al. (2020).
Set f = (y - x) / (y + x) and g = gamma (for compact notation).
Then, we use the expansions

    ((1+f)^g - (1-f)^g) / g = 2*f + (g-1)(g-2)/3 * f^3 + (g-1)(g-2)(g-3)(g-4)/60 * f^5 + O(f^7)

and

    ((1+f)^(g-1) - (1-f)^(g-1)) / (g-1) = 2*f + (g-2)(g-3)/3 * f^3 + (g-2)(g-3)(g-4)(g-5)/60 * f^5 + O(f^7)

Inserting the first few terms of these expansions and performing polynomial long division,
we find that

    stolarsky_mean(x, y, gamma) ≈ (y + x) / 2 * (1 + (g-2)/3 * f^2 - (g+1)(g-2)(g-3)/45 * f^4 + (g+1)(g-2)(g-3)(2g(g-2)-9)/945 * f^6)

Since divisions are usually more expensive on modern hardware than
multiplications (Agner Fog), we try to avoid computing two divisions. Thus,
we use

    f^2 = (y - x)^2 / (x + y)^2
        = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y)

Given ε = 1.0e-4, we use the following algorithm.

    if f^2 < ε
        # use the expansion above
    else
        # use the direct formula (gamma - 1)/gamma * (y^gamma - x^gamma) / (y^(gamma-1) - x^(gamma-1))
    end

# References
- Andrew R. Winters, Christof Czernik, Moritz B. Schily & Gregor J. Gassner (2020)
  Entropy stable numerical approximations for the isothermal and polytropic
  Euler equations
  [DOI: 10.1007/s10543-019-00789-w](https://doi.org/10.1007/s10543-019-00789-w)
- Agner Fog.
  Lists of instruction latencies, throughputs and micro-operation breakdowns
  for Intel, AMD, and VIA CPUs.
  [https://www.agner.org/optimize/instruction_tables.pdf](https://www.agner.org/optimize/instruction_tables.pdf)
"""
@inline stolarsky_mean(x::Real, y::Real, gamma::Real) = stolarsky_mean(promote(x, y,
                                                                               gamma)...)

@inline function stolarsky_mean(x::RealT, y::RealT, gamma::RealT) where {RealT <: Real}
    epsilon_f2 = convert(RealT, 1.0e-4)
    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
    if f2 < epsilon_f2
        # convenience coefficients
        c1 = convert(RealT, 1 / 3) * (gamma - 2)
        c2 = convert(RealT, -1 / 15) * (gamma + 1) * (gamma - 3) * c1
        c3 = convert(RealT, -1 / 21) * (2 * gamma * (gamma - 2) - 9) * c2
        return 0.5f0 * (x + y) * @evalpoly(f2, 1, c1, c2, c3)
    else
        return (gamma - 1) / gamma * (y^gamma - x^gamma) /
               (y^(gamma - 1) - x^(gamma - 1))
    end
end
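
# A small illustrative check (hypothetical REPL session): away from x ≈ y the direct
# formula branch is used, and for identical arguments the series expansion recovers
# the removable singularity, i.e., stolarsky_mean(x, x, gamma) == x.
#
#   julia> gamma = 1.4;
#
#   julia> direct = (gamma - 1) / gamma * (2.0^gamma - 1.0^gamma) / (2.0^(gamma - 1) - 1.0^(gamma - 1));
#
#   julia> Trixi.stolarsky_mean(1.0, 2.0, gamma) ≈ direct
#   true
#
#   julia> Trixi.stolarsky_mean(2.0, 2.0, gamma)  # removable singularity handled by the expansion
#   2.0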
end # @muladd