GitHub Repository: trixi-framework/Trixi.jl
Path: blob/main/src/auxiliary/math.jl
# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
# Since these FMAs can increase the performance of many numerical algorithms,
# we need to opt in explicitly.
# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
@muladd begin
#! format: noindent

const TRIXI_UUID = UUID("a7f1ee26-1774-49b1-8366-f1abc58fbfcb")

"""
    Trixi.set_threading_backend!(backend::Symbol = :polyester; force = true)

Select the backend used for multithreaded loops inside Trixi.jl.
The selected backend affects the behavior of Trixi.jl's [`@threaded`](@ref) macro, which is used
throughout the codebase for parallel loops. By default, Polyester.jl is enabled for
optimal performance, but switching backends can be useful for comparisons or debugging.

# Available backends
- `:polyester`: Use [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl) (the default)
- `:static`: Use Julia's built-in static thread scheduling via `Threads.@threads :static`
- `:serial`: Disable threading and execute loops serially
- `:kernelabstractions`: Preferentially use the [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)
  kernels written in Trixi.jl, falling back to `:static` execution otherwise
"""
function set_threading_backend!(backend::Symbol = :polyester; force = true)
    valid_backends = (:polyester, :static, :serial, :kernelabstractions)
    if !(backend in valid_backends)
        throw(ArgumentError("Invalid threading backend: $(backend). Valid options are: $(join(valid_backends, ", "))"))
    end
    set_preferences!(TRIXI_UUID, "backend" => string(backend), force = force)
    @info "Please restart Julia and reload Trixi.jl for the `backend` change to take effect"
end
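
# A minimal usage sketch (hypothetical REPL session; the preference is stored via
# Preferences.jl and only takes effect after restarting Julia):
#
#   julia> Trixi.set_threading_backend!(:serial)
#   [ Info: Please restart Julia and reload Trixi.jl for the `backend` change to take effect
#
# Passing a symbol outside the list of valid backends throws an `ArgumentError`.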

"""
    Trixi.set_loop_vectorization!(toggle::Bool; force = true)

Toggle the usage of [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
By default, LoopVectorization.jl is enabled, but it can
be useful for performance comparisons to switch to the Julia core backend.

This does not fully disable LoopVectorization.jl,
but only its internal use as part of Trixi.jl.
"""
function set_loop_vectorization!(toggle::Bool; force = true)
    set_preferences!(TRIXI_UUID, "loop_vectorization" => toggle, force = force)
    @info "Please restart Julia and reload Trixi.jl for the `loop_vectorization` change to take effect"
end

"""
    Trixi.set_sqrt_type!(type; force = true)

Set the `type` of the square root function to be used in Trixi.jl.
The default is `"sqrt_Trixi_NaN"`, which returns `NaN` for negative arguments
instead of throwing an error.
Alternatively, you can set `type` to `"sqrt_Base"` to use the Julia built-in `sqrt` function,
which provides a stack trace of the error; this might come in handy when debugging code.
"""
function set_sqrt_type!(type; force = true)
    @assert type == "sqrt_Trixi_NaN"||type == "sqrt_Base" "Only allowed `sqrt` function types are `\"sqrt_Trixi_NaN\"` and `\"sqrt_Base\"`"
    set_preferences!(TRIXI_UUID, "sqrt" => type, force = force)
    @info "Please restart Julia and reload Trixi.jl for the `sqrt` computation change to take effect"
end

@static if _PREFERENCE_SQRT == "sqrt_Trixi_NaN"
    """
        Trixi.sqrt(x::Real)

    Custom square root function which returns `NaN` for negative arguments instead of throwing an error.
    This is required to ensure [correct results for multithreaded computations](https://github.com/trixi-framework/Trixi.jl/issues/1766)
    when using the [`Polyester` package](https://github.com/JuliaSIMD/Polyester.jl),
    i.e., using the `@batch` macro instead of the Julia built-in `@threads` macro, see [`@threaded`](@ref).

    We dispatch this function for `Float64, Float32, Float16` to the LLVM intrinsics
    `llvm.sqrt.f64`, `llvm.sqrt.f32`, `llvm.sqrt.f16`, since for these types the LLVM intrinsics can be used out of the box,
    i.e., they return `NaN` for negative arguments.
    In principle, one could also use the `sqrt_llvm` call, but for transparency and consistency with [`log`](@ref) we
    spell out the datatype-dependent functions here.
    For other types, such as integers or dual numbers required for algorithmic differentiation, we
    fall back to the Julia built-in `sqrt` function after a check for negative arguments.
    Since these cases are not performance critical, the check for negativity does not hurt here
    and can (as of now) even be optimized away by the compiler due to the implementation of `sqrt` in Julia.

    When debugging code, it might be useful to change the implementation of this function to redirect to
    the Julia built-in `sqrt` function, as this reports the exact place in code where the domain is violated
    in the stacktrace.

    See also [`Trixi.set_sqrt_type!`](@ref).
    """
    @inline sqrt(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.sqrt(x)

    # For `sqrt` we could use the `sqrt_llvm` call, ...
    #@inline sqrt(x::Union{Float64, Float32, Float16}) = Base.sqrt_llvm(x)

    # ... but for transparency and consistency we use the direct LLVM calls here.
    @inline sqrt(x::Float64) = ccall("llvm.sqrt.f64", llvmcall, Float64, (Float64,), x)
    @inline sqrt(x::Float32) = ccall("llvm.sqrt.f32", llvmcall, Float32, (Float32,), x)
    @inline sqrt(x::Float16) = ccall("llvm.sqrt.f16", llvmcall, Float16, (Float16,), x)
end

"""
    Trixi.set_log_type!(type; force = true)

Set the `type` of the (natural) `log` function to be used in Trixi.jl.
The default is `"log_Trixi_NaN"`, which returns `NaN` for negative arguments
instead of throwing an error.
Alternatively, you can set `type` to `"log_Base"` to use the Julia built-in `log` function,
which provides a stack trace of the error; this might come in handy when debugging code.
"""
function set_log_type!(type; force = true)
    @assert type == "log_Trixi_NaN"||type == "log_Base" "Only allowed log function types are `\"log_Trixi_NaN\"` and `\"log_Base\"`."
    set_preferences!(TRIXI_UUID, "log" => type, force = force)
    @info "Please restart Julia and reload Trixi.jl for the `log` computation change to take effect"
end

@static if _PREFERENCE_LOG == "log_Trixi_NaN"
    """
        Trixi.log(x::Real)

    Custom natural logarithm function which returns `NaN` for negative arguments instead of throwing an error.
    This is required to ensure [correct results for multithreaded computations](https://github.com/trixi-framework/Trixi.jl/issues/1766)
    when using the [`Polyester` package](https://github.com/JuliaSIMD/Polyester.jl),
    i.e., using the `@batch` macro instead of the Julia built-in `@threads` macro, see [`@threaded`](@ref).

    We dispatch this function for `Float64, Float32, Float16` to the respective LLVM intrinsics
    `llvm.log.f64`, `llvm.log.f32`, `llvm.log.f16`, since for these types the LLVM intrinsics can be used out of the box, i.e.,
    they return `NaN` for negative arguments.
    For other types, such as integers or dual numbers required for algorithmic differentiation, we
    fall back to the Julia built-in `log` function after a check for negative arguments.
    Since these cases are not performance critical, the check for negativity does not hurt here.

    When debugging code, it might be useful to change the implementation of this function to redirect to
    the Julia built-in `log` function, as this reports the exact place in code where the domain is violated
    in the stacktrace.

    See also [`Trixi.set_log_type!`](@ref).
    """
    @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)

    @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
    @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
    @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
end
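
# A minimal sanity-check sketch (hypothetical REPL session, assuming the default
# preferences "sqrt_Trixi_NaN" and "log_Trixi_NaN" are active, so the custom
# definitions above are compiled in): the custom functions return `NaN` for negative
# floating-point arguments, whereas `Base.sqrt`/`Base.log` would throw a `DomainError`.
#
#   julia> Trixi.sqrt(-1.0), Trixi.log(-1.0)
#   (NaN, NaN)
#
#   julia> Trixi.sqrt(4.0), Trixi.log(1.0)
#   (2.0, 0.0)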

"""
    Trixi.ln_mean(x::Real, y::Real)

Compute the logarithmic mean

    ln_mean(x, y) = (y - x) / (log(y) - log(x)) = (y - x) / log(y / x)

Problem: The formula above has a removable singularity at `x == y`. Thus,
some care must be taken to implement it correctly without problems or loss
of accuracy when `x ≈ y`. Here, we use the approach proposed by
Ismail and Roe (2009).
Set ξ = y / x. Then, we have

    (y - x) / log(y / x) = (x + y) / log(ξ) * (ξ - 1) / (ξ + 1)

Set f = (ξ - 1) / (ξ + 1) = (y - x) / (x + y). Then, we use the expansion

    log(ξ) = 2 * f * (1 + f^2 / 3 + f^4 / 5 + f^6 / 7) + O(f^9)

Inserting the first few terms of this expansion yields

    (y - x) / log(ξ) ≈ (x + y) * f / (2 * f * (1 + f^2 / 3 + f^4 / 5 + f^6 / 7))
                     = (x + y) / (2 + 2/3 * f^2 + 2/5 * f^4 + 2/7 * f^6)

Since divisions are usually more expensive on modern hardware than
multiplications (Agner Fog), we try to avoid computing two divisions. Thus,
we use

    f^2 = (y - x)^2 / (x + y)^2
        = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y)

Given ε = 1.0e-4, we use the following algorithm.

    if f^2 < ε
        # use the expansion above
    else
        # use the direct formula (y - x) / log(y / x)
    end

# References
- Ismail, Roe (2009).
  Affordable, entropy-consistent Euler flux functions II: Entropy production at shocks.
  [DOI: 10.1016/j.jcp.2009.04.021](https://doi.org/10.1016/j.jcp.2009.04.021)
- Agner Fog.
  Lists of instruction latencies, throughputs and micro-operation breakdowns
  for Intel, AMD, and VIA CPUs.
  [https://www.agner.org/optimize/instruction_tables.pdf](https://www.agner.org/optimize/instruction_tables.pdf)
"""
@inline ln_mean(x::Real, y::Real) = ln_mean(promote(x, y)...)

@inline function ln_mean(x::RealT, y::RealT) where {RealT <: Real}
    epsilon_f2 = convert(RealT, 1.0e-4)
    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
    if f2 < epsilon_f2
        return (x + y) / @evalpoly(f2,
                                   2,
                                   convert(RealT, 2 / 3),
                                   convert(RealT, 2 / 5),
                                   convert(RealT, 2 / 7))
    else
        return (y - x) / log(y / x)
    end
end
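
# A small illustrative check (hypothetical REPL session, not part of the library):
# for well-separated arguments the direct formula branch is taken, while for x ≈ y
# the expansion avoids the 0/0 singularity.
#
#   julia> Trixi.ln_mean(1.0, 2.0) ≈ (2.0 - 1.0) / log(2.0)
#   true
#
#   julia> Trixi.ln_mean(1.0, 1.0)  # removable singularity handled by the expansion
#   1.0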

"""
    Trixi.inv_ln_mean(x::Real, y::Real)

Compute the inverse `1 / ln_mean(x, y)` of the logarithmic mean
[`ln_mean`](@ref).

This function may be used to increase performance where the inverse of the
logarithmic mean is needed, by replacing a (slow) division by a (fast)
multiplication.
"""
@inline inv_ln_mean(x::Real, y::Real) = inv_ln_mean(promote(x, y)...)

@inline function inv_ln_mean(x::RealT, y::RealT) where {RealT <: Real}
    epsilon_f2 = convert(RealT, 1.0e-4)
    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
    if f2 < epsilon_f2
        return @evalpoly(f2,
                         2,
                         convert(RealT, 2 / 3),
                         convert(RealT, 2 / 5),
                         convert(RealT, 2 / 7)) / (x + y)
    else
        return log(y / x) / (y - x)
    end
end
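
# Illustrative relation (hypothetical REPL session): `inv_ln_mean` is the reciprocal
# of `ln_mean`, so flux implementations can trade a division for a multiplication.
#
#   julia> x, y = 0.9, 1.1;
#
#   julia> Trixi.inv_ln_mean(x, y) * Trixi.ln_mean(x, y) ≈ 1.0
#   true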

# `Base.max` and `Base.min` perform additional checks for signed zeros and `NaN`s
# which are not present in comparable functions in Fortran/C++. For example,
# ```julia
# julia> @code_native debuginfo=:none syntax=:intel max(1.0, 2.0)
#     .text
#     vmovq rcx, xmm1
#     vmovq rax, xmm0
#     vcmpordsd xmm2, xmm0, xmm0
#     vblendvpd xmm2, xmm0, xmm1, xmm2
#     vcmpordsd xmm3, xmm1, xmm1
#     vblendvpd xmm3, xmm1, xmm0, xmm3
#     vmovapd xmm4, xmm2
#     test rcx, rcx
#     jns L45
#     vmovapd xmm4, xmm3
# L45:
#     test rax, rax
#     js L54
#     vmovapd xmm4, xmm3
# L54:
#     vcmpltsd xmm0, xmm0, xmm1
#     vblendvpd xmm0, xmm4, xmm2, xmm0
#     ret
#     nop word ptr cs:[rax + rax]
#
# julia> @code_native debuginfo=:none syntax=:intel min(1.0, 2.0)
#     .text
#     vmovq rcx, xmm1
#     vmovq rax, xmm0
#     vcmpordsd xmm2, xmm0, xmm0
#     vblendvpd xmm2, xmm0, xmm1, xmm2
#     vcmpordsd xmm3, xmm1, xmm1
#     vblendvpd xmm3, xmm1, xmm0, xmm3
#     vmovapd xmm4, xmm2
#     test rcx, rcx
#     jns L58
#     test rax, rax
#     js L67
# L46:
#     vcmpltsd xmm0, xmm1, xmm0
#     vblendvpd xmm0, xmm4, xmm2, xmm0
#     ret
# L58:
#     vmovapd xmm4, xmm3
#     test rax, rax
#     jns L46
# L67:
#     vmovapd xmm4, xmm3
#     vcmpltsd xmm0, xmm1, xmm0
#     vblendvpd xmm0, xmm4, xmm2, xmm0
#     ret
#     nop word ptr cs:[rax + rax]
#     nop dword ptr [rax]
# ```
# In contrast, we get the much simpler and faster version
# ```julia
# julia> @code_native debuginfo=:none syntax=:intel Base.FastMath.max_fast(1.0, 2.0)
#     .text
#     vmaxsd xmm0, xmm1, xmm0
#     ret
#     nop word ptr cs:[rax + rax]
#
# julia> @code_native debuginfo=:none syntax=:intel Base.FastMath.min_fast(1.0, 2.0)
#     .text
#     vminsd xmm0, xmm0, xmm1
#     ret
#     nop word ptr cs:[rax + rax]
# ```
# when using `@fastmath`, which we also get from
# [Fortran](https://godbolt.org/z/Yrsa1js7P)
# or [C++](https://godbolt.org/z/674G7Pccv).
#
# Note however that such a custom reimplementation can cause incompatibilities with other
# packages. Currently we are affected by an issue with MPI.jl on ARM, see
# https://github.com/trixi-framework/Trixi.jl/issues/1922
# The workaround is to resort to Base.min / Base.max when using MPI reductions.
"""
    Trixi.max(x, y, ...)

Return the maximum of the arguments. See also the `maximum` function to take
the maximum element from a collection.

This version in Trixi.jl is semantically equivalent to `Base.max` but may
be implemented differently. In particular, it may avoid potentially expensive
checks necessary in the presence of `NaN`s (or signed zeros).

# Examples

```jldoctest
julia> max(2, 5, 1)
5
```
"""
@inline max(args...) = @fastmath max(args...)

"""
    Trixi.min(x, y, ...)

Return the minimum of the arguments. See also the `minimum` function to take
the minimum element from a collection.

This version in Trixi.jl is semantically equivalent to `Base.min` but may
be implemented differently. In particular, it may avoid potentially expensive
checks necessary in the presence of `NaN`s (or signed zeros).

# Examples

```jldoctest
julia> min(2, 5, 1)
1
```
"""
@inline min(args...) = @fastmath min(args...)
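
# Illustrative caveat (hypothetical REPL session): for ordinary finite arguments the
# `@fastmath` versions agree with `Base.max`/`Base.min`, but the result for `NaN`
# or signed-zero arguments is not specified, so they should not be relied upon there.
#
#   julia> Trixi.max(2, 5, 1) == Base.max(2, 5, 1)
#   true
#
#   julia> Base.max(NaN, 1.0)   # Base propagates NaN; Trixi.max may not
#   NaN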

"""
    Trixi.positive_part(x)

Return `x` if `x` is positive, else zero. In other words, return
`(x + abs(x)) / 2` for real numbers `x`.
"""
@inline function positive_part(x)
    return max(x, zero(x))
end

"""
    Trixi.negative_part(x)

Return `x` if `x` is negative, else zero. In other words, return
`(x - abs(x)) / 2` for real numbers `x`.
"""
@inline function negative_part(x)
    return min(x, zero(x))
end
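
# Quick illustration (hypothetical REPL session) of the positive/negative part
# decomposition, which satisfies positive_part(x) + negative_part(x) == x:
#
#   julia> Trixi.positive_part(-3.0), Trixi.negative_part(-3.0)
#   (0.0, -3.0)
#
#   julia> Trixi.positive_part(2.0), Trixi.negative_part(2.0)
#   (2.0, 0.0)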

"""
    Trixi.stolarsky_mean(x::Real, y::Real, gamma::Real)

Compute an instance of a weighted Stolarsky mean of the form

    stolarsky_mean(x, y, gamma) = (gamma - 1)/gamma * (y^gamma - x^gamma) / (y^(gamma-1) - x^(gamma-1))

where `gamma > 1`.

Problem: The formula above has a removable singularity at `x == y`. Thus,
some care must be taken to implement it correctly without problems or loss
of accuracy when `x ≈ y`. Here, we use the approach proposed by
Winters et al. (2020).
Set f = (y - x) / (y + x) and g = gamma (for compact notation).
Then, we use the expansions

    ((1+f)^g - (1-f)^g) / g = 2*f + (g-1)(g-2)/3 * f^3 + (g-1)(g-2)(g-3)(g-4)/60 * f^5 + O(f^7)

and

    ((1+f)^(g-1) - (1-f)^(g-1)) / (g-1) = 2*f + (g-2)(g-3)/3 * f^3 + (g-2)(g-3)(g-4)(g-5)/60 * f^5 + O(f^7)

Inserting the first few terms of these expansions and performing polynomial long division,
we find that

    stolarsky_mean(x, y, gamma) ≈ (y + x) / 2 * (1 + (g-2)/3 * f^2 - (g+1)(g-2)(g-3)/45 * f^4 + (g+1)(g-2)(g-3)(2g(g-2)-9)/945 * f^6)

Since divisions are usually more expensive on modern hardware than
multiplications (Agner Fog), we try to avoid computing two divisions. Thus,
we use

    f^2 = (y - x)^2 / (x + y)^2
        = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y)

Given ε = 1.0e-4, we use the following algorithm.

    if f^2 < ε
        # use the expansion above
    else
        # use the direct formula (gamma - 1)/gamma * (y^gamma - x^gamma) / (y^(gamma-1) - x^(gamma-1))
    end

# References
- Andrew R. Winters, Christof Czernik, Moritz B. Schily & Gregor J. Gassner (2020)
  Entropy stable numerical approximations for the isothermal and polytropic
  Euler equations
  [DOI: 10.1007/s10543-019-00789-w](https://doi.org/10.1007/s10543-019-00789-w)
- Agner Fog.
  Lists of instruction latencies, throughputs and micro-operation breakdowns
  for Intel, AMD, and VIA CPUs.
  [https://www.agner.org/optimize/instruction_tables.pdf](https://www.agner.org/optimize/instruction_tables.pdf)
"""
@inline stolarsky_mean(x::Real, y::Real, gamma::Real) = stolarsky_mean(promote(x, y,
                                                                               gamma)...)

@inline function stolarsky_mean(x::RealT, y::RealT, gamma::RealT) where {RealT <: Real}
    epsilon_f2 = convert(RealT, 1.0e-4)
    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
    if f2 < epsilon_f2
        # convenience coefficients
        c1 = convert(RealT, 1 / 3) * (gamma - 2)
        c2 = convert(RealT, -1 / 15) * (gamma + 1) * (gamma - 3) * c1
        c3 = convert(RealT, -1 / 21) * (2 * gamma * (gamma - 2) - 9) * c2
        return 0.5f0 * (x + y) * @evalpoly(f2, 1, c1, c2, c3)
    else
        return (gamma - 1) / gamma * (y^gamma - x^gamma) /
               (y^(gamma - 1) - x^(gamma - 1))
    end
end
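
# A small illustrative check (hypothetical REPL session): away from x ≈ y the direct
# formula branch is used, and for identical arguments the series expansion recovers
# the removable singularity, i.e., stolarsky_mean(x, x, gamma) == x.
#
#   julia> gamma = 1.4;
#
#   julia> direct = (gamma - 1) / gamma * (2.0^gamma - 1.0^gamma) / (2.0^(gamma - 1) - 1.0^(gamma - 1));
#
#   julia> Trixi.stolarsky_mean(1.0, 2.0, gamma) ≈ direct
#   true
#
#   julia> Trixi.stolarsky_mean(2.0, 2.0, gamma)  # removable singularity handled by the expansion
#   2.0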
end # @muladd