Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
trixi-framework
GitHub Repository: trixi-framework/Trixi.jl
Path: blob/main/benchmark/multiply_dimensionwise/benchmark_multiply_dimensionwise.jl
2055 views
1
# Disable formatting this file since it contains highly unusual formatting for better
2
# readability
3
#! format: off
4
5
import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()
6
7
using BenchmarkTools
8
using BSON
9
using LoopVectorization
10
using MuladdMacro
11
using StaticArrays
12
using Tullio
13
14
###################################################################################################
15
# 2D versions
16
function multiply_dimensionwise_sequential!(
17
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde,
18
tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2)))
19
n_vars = size(data_out, 1)
20
n_nodes_out = size(vandermonde, 1)
21
n_nodes_in = size(vandermonde, 2)
22
data_out .= zero(eltype(data_out))
23
24
@boundscheck begin
25
inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
26
(size(data_in, 1) == n_vars) &&
27
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
28
inbounds || throw(BoundsError())
29
end
30
31
# Interpolate in x-direction
32
@inbounds for j in 1:n_nodes_in, i in 1:n_nodes_out
33
for ii in 1:n_nodes_in
34
for v in 1:n_vars
35
tmp1[v, i, j] += vandermonde[i, ii] * data_in[v, ii, j]
36
end
37
end
38
end
39
40
# Interpolate in y-direction
41
@inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out
42
for jj in 1:n_nodes_in
43
for v in 1:n_vars
44
data_out[v, i, j] += vandermonde[j, jj] * tmp1[v, i, jj]
45
end
46
end
47
end
48
49
return data_out
50
end
51
52
function multiply_dimensionwise_sequential_avx!(
53
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde,
54
tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2)))
55
n_vars = size(data_out, 1)
56
n_nodes_out = size(vandermonde, 1)
57
n_nodes_in = size(vandermonde, 2)
58
data_out .= zero(eltype(data_out))
59
60
@boundscheck begin
61
inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
62
(size(data_in, 1) == n_vars) &&
63
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
64
inbounds || throw(BoundsError())
65
end
66
67
# Interpolate in x-direction
68
@avx for j in 1:n_nodes_in, i in 1:n_nodes_out
69
for ii in 1:n_nodes_in
70
for v in 1:n_vars
71
tmp1[v, i, j] += vandermonde[i, ii] * data_in[v, ii, j]
72
end
73
end
74
end
75
76
# Interpolate in y-direction
77
@avx for j in 1:n_nodes_out, i in 1:n_nodes_out
78
for jj in 1:n_nodes_in
79
for v in 1:n_vars
80
data_out[v, i, j] += vandermonde[j, jj] * tmp1[v, i, jj]
81
end
82
end
83
end
84
85
return data_out
86
end
87
88
function multiply_dimensionwise_sequential_tullio!(
89
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde,
90
tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2)))
91
92
# Interpolate in x-direction
93
@tullio threads=false tmp1[v, i, j] = vandermonde[i, ii] * data_in[v, ii, j]
94
95
# Interpolate in y-direction
96
@tullio threads=false data_out[v, i, j] = vandermonde[j, jj] * tmp1[v, i, jj]
97
98
return data_out
99
end
100
101
@generated function multiply_dimensionwise_sequential_nexpr!(
102
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde::SMatrix{n_nodes_out,n_nodes_in}, ::Val{n_vars}) where {n_nodes_out, n_nodes_in, n_vars}
103
quote
104
@boundscheck begin
105
inbounds = (size(data_out, 1) == $n_vars) &&
106
(size(data_out, 2) == size(data_out, 3) == $n_nodes_out) &&
107
(size(data_in, 1) == $n_vars) &&
108
(size(data_in, 2) == size(data_in, 3) == $n_nodes_in)
109
inbounds || throw(BoundsError())
110
end
111
112
# Interpolate in x-direction
113
@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_in j -> begin
114
Base.Cartesian.@nexprs $n_nodes_out i -> begin
115
Base.Cartesian.@nexprs $n_vars v -> begin
116
tmp1_v_i_j = zero(eltype(data_out))
117
Base.Cartesian.@nexprs $n_nodes_in ii -> begin
118
tmp1_v_i_j += vandermonde[i, ii] * data_in[v, ii, j]
119
end
120
end
121
end
122
end
123
124
# Interpolate in y-direction
125
@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_out j -> begin
126
Base.Cartesian.@nexprs $n_nodes_out i -> begin
127
Base.Cartesian.@nexprs $n_vars v -> begin
128
tmp2_v_i_j = zero(eltype(data_out))
129
Base.Cartesian.@nexprs $n_nodes_in jj -> begin
130
tmp2_v_i_j += vandermonde[j, jj] * tmp1_v_i_jj
131
end
132
data_out[v, i, j] = tmp2_v_i_j
133
end
134
end
135
end
136
137
return data_out
138
end
139
end
140
141
142
function multiply_dimensionwise_squeezed!(
143
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde)
144
n_vars = size(data_out, 1)
145
n_nodes_out = size(vandermonde, 1)
146
n_nodes_in = size(vandermonde, 2)
147
data_out .= zero(eltype(data_out))
148
149
@boundscheck begin
150
inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
151
(size(data_in, 1) == n_vars) &&
152
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
153
inbounds || throw(BoundsError())
154
end
155
156
@inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out
157
for v in 1:n_vars
158
acc = zero(eltype(data_out))
159
for jj in 1:n_nodes_in, ii in 1:n_nodes_in
160
acc += vandermonde[i, ii]* vandermonde[j, jj] * data_in[v, ii, jj]
161
end
162
data_out[v, i, j] = acc
163
end
164
end
165
166
return data_out
167
end
168
169
function multiply_dimensionwise_squeezed_avx!(
170
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde)
171
n_vars = size(data_out, 1)
172
n_nodes_out = size(vandermonde, 1)
173
n_nodes_in = size(vandermonde, 2)
174
data_out .= zero(eltype(data_out))
175
176
@boundscheck begin
177
inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&
178
(size(data_in, 1) == n_vars) &&
179
(size(data_in, 2) == size(data_in, 3) == n_nodes_in)
180
inbounds || throw(BoundsError())
181
end
182
183
@avx for j in 1:n_nodes_out, i in 1:n_nodes_out
184
for v in 1:n_vars
185
acc = zero(eltype(data_out))
186
for jj in 1:n_nodes_in, ii in 1:n_nodes_in
187
acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
188
end
189
data_out[v, i, j] = acc
190
end
191
end
192
193
return data_out
194
end
195
196
function multiply_dimensionwise_squeezed_tullio!(
197
data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde)
198
199
@tullio threads=false data_out[v, i, j] = vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]
200
201
return data_out
202
end
203
204
205
function run_benchmarks_2d(n_vars=4, n_nodes_in=4, n_nodes_out=2*n_nodes_in)
206
data_in = randn(n_vars, n_nodes_in, n_nodes_in)
207
data_out = randn(n_vars, n_nodes_out, n_nodes_out)
208
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
209
vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
210
vandermonde_mmatrix = MMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
211
212
println("\n\n# 2D ", "#"^70)
213
println("n_vars = ", n_vars)
214
println("n_nodes_in = ", n_nodes_in)
215
println("n_nodes_out = ", n_nodes_out)
216
println()
217
218
println("\n","multiply_dimensionwise_sequential!")
219
println("vandermonde_dynamic")
220
multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_dynamic)
221
data_out_copy = copy(data_out)
222
display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_dynamic)))
223
println("\n", "vandermonde_static")
224
multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_static)
225
@assert data_out data_out_copy
226
display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_static)))
227
println("\n", "vandermonde_mmatrix")
228
multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_mmatrix)
229
@assert data_out data_out_copy
230
display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_mmatrix)))
231
println()
232
233
println("\n","multiply_dimensionwise_sequential_avx!")
234
println("vandermonde_dynamic")
235
multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_dynamic)
236
@assert data_out data_out_copy
237
display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))
238
println("\n", "vandermonde_static")
239
multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_static)
240
@assert data_out data_out_copy
241
display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_static)))
242
println("\n", "vandermonde_mmatrix")
243
multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_mmatrix)
244
@assert data_out data_out_copy
245
display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_mmatrix)))
246
println()
247
248
println("\n","multiply_dimensionwise_sequential_tullio!")
249
println("vandermonde_dynamic")
250
multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_dynamic)
251
@assert data_out data_out_copy
252
display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))
253
println("\n", "vandermonde_static")
254
multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_static)
255
@assert data_out data_out_copy
256
display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static)))
257
println("\n", "vandermonde_mmatrix")
258
multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_mmatrix)
259
@assert data_out data_out_copy
260
display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_mmatrix)))
261
println()
262
263
println("\n","multiply_dimensionwise_sequential_nexpr!")
264
println("vandermonde_static")
265
multiply_dimensionwise_sequential_nexpr!(data_out, data_in, vandermonde_static, Val(n_vars))
266
@assert data_out data_out_copy
267
display(@benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars))))
268
println()
269
270
271
println("\n","multiply_dimensionwise_squeezed!")
272
println("vandermonde_dynamic")
273
multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_dynamic)
274
@assert data_out data_out_copy
275
display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic)))
276
println("\n", "vandermonde_static")
277
multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_static)
278
@assert data_out data_out_copy
279
display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_static)))
280
println("\n", "vandermonde_mmatrix")
281
multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_mmatrix)
282
@assert data_out data_out_copy
283
display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_mmatrix)))
284
println()
285
286
println("\n","multiply_dimensionwise_squeezed_avx!")
287
println("vandermonde_dynamic")
288
multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_dynamic)
289
@assert data_out data_out_copy
290
display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))
291
println("\n", "vandermonde_static")
292
multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_static)
293
@assert data_out data_out_copy
294
display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static)))
295
println("\n", "vandermonde_mmatrix")
296
multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_mmatrix)
297
@assert data_out data_out_copy
298
display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_mmatrix)))
299
println()
300
301
println("\n","multiply_dimensionwise_squeezed_tullio!")
302
println("vandermonde_dynamic")
303
multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_dynamic)
304
@assert data_out data_out_copy
305
display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))
306
println("\n", "vandermonde_static")
307
multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_static)
308
@assert data_out data_out_copy
309
display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static)))
310
println("\n", "vandermonde_mmatrix")
311
multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_mmatrix)
312
@assert data_out data_out_copy
313
display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_mmatrix)))
314
println()
315
316
nothing
317
end
318
319
# TODO
320
run_benchmarks_2d(4, 4, 4) # n_vars, n_nodes_in, n_nodes_out
321
322
323
function compute_benchmarks_2d(n_vars, n_nodes_in, n_nodes_out)
324
data_in = randn(n_vars, n_nodes_in, n_nodes_in)
325
data_out = randn(n_vars, n_nodes_out, n_nodes_out)
326
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
327
vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
328
tmp1 = zeros(eltype(data_out), n_vars, n_nodes_out, n_nodes_in)
329
330
println("n_vars = ", n_vars, "; n_nodes_in = ", n_nodes_in, "; n_nodes_out = ", n_nodes_out)
331
sequential_dynamic = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))
332
sequential_static = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static))
333
#FIXME sequential_nexpr = @benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars)))
334
sequential_dynamic_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic), $(tmp1))
335
sequential_static_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static), $(tmp1))
336
squeezed_dynamic = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))
337
squeezed_static = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static))
338
339
return time(median(sequential_dynamic)),
340
time(median(sequential_static)),
341
NaN, #FIXME time(median(sequential_nexpr)),
342
time(median(sequential_dynamic_prealloc)),
343
time(median(sequential_static_prealloc)),
344
time(median(squeezed_dynamic)),
345
time(median(squeezed_static))
346
end
347
348
function compute_benchmarks_2d(n_vars_list, n_nodes_in_list)
349
sequential_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))
350
sequential_static = zeros(length(n_vars_list), length(n_nodes_in_list))
351
sequential_nexpr = zeros(length(n_vars_list), length(n_nodes_in_list))
352
sequential_dynamic_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))
353
sequential_static_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))
354
squeezed_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))
355
squeezed_static = zeros(length(n_vars_list), length(n_nodes_in_list))
356
357
# n_nodes_out = n_nodes_in
358
# mortar
359
# superset of n_vars = 1, n_nodes_out = n_nodes_in, used for blending
360
for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)
361
for (idx_variables, n_vars) in enumerate(n_vars_list)
362
n_nodes_out = n_nodes_in
363
sequential_dynamic[idx_variables, idx_nodes],
364
sequential_static[idx_variables, idx_nodes],
365
sequential_nexpr[idx_variables, idx_nodes],
366
sequential_dynamic_prealloc[idx_variables, idx_nodes],
367
sequential_static_prealloc[idx_variables, idx_nodes],
368
squeezed_dynamic[idx_variables, idx_nodes],
369
squeezed_static[idx_variables, idx_nodes] =
370
compute_benchmarks_2d(n_vars, n_nodes_in, n_nodes_out)
371
end
372
end
373
BSON.@save "2D_nVarTotal_nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static
374
375
# n_nodes_out = 2*n_nodes_in
376
# visualization
377
for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)
378
for (idx_variables, n_vars) in enumerate(n_vars_list)
379
n_nodes_out = 2 * n_nodes_in
380
sequential_dynamic[idx_variables, idx_nodes],
381
sequential_static[idx_variables, idx_nodes],
382
sequential_nexpr[idx_variables, idx_nodes],
383
sequential_dynamic_prealloc[idx_variables, idx_nodes],
384
sequential_static_prealloc[idx_variables, idx_nodes],
385
squeezed_dynamic[idx_variables, idx_nodes],
386
squeezed_static[idx_variables, idx_nodes] =
387
compute_benchmarks_2d(n_vars, n_nodes_in, n_nodes_out)
388
end
389
end
390
BSON.@save "2D_nVarTotal_2nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static
391
392
return nothing
393
end
394
395
# TODO
396
compute_benchmarks_2d(1:10, 2:10)
397
398
399
###################################################################################################
400
# 3D versions
401
402
function multiply_dimensionwise_sequential!(
403
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde,
404
tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),
405
tmp2=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2)))
406
n_vars = size(data_out, 1)
407
n_nodes_out = size(vandermonde, 1)
408
n_nodes_in = size(vandermonde, 2)
409
data_out .= zero(eltype(data_out))
410
411
@boundscheck begin
412
inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
413
(size(data_in, 1) == n_vars) &&
414
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
415
inbounds || throw(BoundsError())
416
end
417
418
# Interpolate in x-direction
419
@inbounds for k in 1:n_nodes_in, j in 1:n_nodes_in, i in 1:n_nodes_out
420
for ii in 1:n_nodes_in
421
for v in 1:n_vars
422
tmp1[v, i, j, k] += vandermonde[i, ii] * data_in[v, ii, j, k]
423
end
424
end
425
end
426
427
# Interpolate in y-direction
428
@inbounds for k in 1:n_nodes_in, j in 1:n_nodes_out, i in 1:n_nodes_out
429
for jj in 1:n_nodes_in
430
for v in 1:n_vars
431
tmp2[v, i, j, k] += vandermonde[j, jj] * tmp1[v, i, jj, k]
432
end
433
end
434
end
435
436
# Interpolate in z-direction
437
@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
438
for kk in 1:n_nodes_in
439
for v in 1:n_vars
440
data_out[v, i, j, k] += vandermonde[k, kk] * tmp2[v, i, j, kk]
441
end
442
end
443
end
444
445
return data_out
446
end
447
448
function multiply_dimensionwise_sequential_avx!(
449
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde,
450
tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),
451
tmp2=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2)))
452
n_vars = size(data_out, 1)
453
n_nodes_out = size(vandermonde, 1)
454
n_nodes_in = size(vandermonde, 2)
455
data_out .= zero(eltype(data_out))
456
457
@boundscheck begin
458
inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
459
(size(data_in, 1) == n_vars) &&
460
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
461
inbounds || throw(BoundsError())
462
end
463
464
# Interpolate in x-direction
465
@avx for k in 1:n_nodes_in, j in 1:n_nodes_in, i in 1:n_nodes_out
466
for ii in 1:n_nodes_in
467
for v in 1:n_vars
468
tmp1[v, i, j, k] += vandermonde[i, ii] * data_in[v, ii, j, k]
469
end
470
end
471
end
472
473
# Interpolate in y-direction
474
@avx for k in 1:n_nodes_in, j in 1:n_nodes_out, i in 1:n_nodes_out
475
for jj in 1:n_nodes_in
476
for v in 1:n_vars
477
tmp2[v, i, j, k] += vandermonde[j, jj] * tmp1[v, i, jj, k]
478
end
479
end
480
end
481
482
# Interpolate in z-direction
483
@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
484
for kk in 1:n_nodes_in
485
for v in 1:n_vars
486
data_out[v, i, j, k] += vandermonde[k, kk] * tmp2[v, i, j, kk]
487
end
488
end
489
end
490
491
return data_out
492
end
493
494
function multiply_dimensionwise_sequential_tullio!(
495
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde,
496
tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),
497
tmp2=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2)))
498
499
# Interpolate in x-direction
500
@tullio threads=false tmp1[v, i, j, k] = vandermonde[i, ii] * data_in[v, ii, j, k]
501
502
# Interpolate in y-direction
503
@tullio threads=false tmp2[v, i, j, k] = vandermonde[j, jj] * tmp1[v, i, jj, k]
504
505
# Interpolate in z-direction
506
@tullio threads=false data_out[v, i, j, k] = vandermonde[k, kk] * tmp2[v, i, j, kk]
507
508
return data_out
509
end
510
511
@generated function multiply_dimensionwise_sequential_nexpr!(
512
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde::SMatrix{n_nodes_out,n_nodes_in}, ::Val{n_vars}) where {n_nodes_out, n_nodes_in, n_vars}
513
quote
514
@boundscheck begin
515
inbounds = (size(data_out, 1) == $n_vars) &&
516
(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == $n_nodes_out) &&
517
(size(data_in, 1) == $n_vars) &&
518
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == $n_nodes_in)
519
inbounds || throw(BoundsError())
520
end
521
522
# Interpolate in x-direction
523
@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_in k -> begin
524
Base.Cartesian.@nexprs $n_nodes_in j -> begin
525
Base.Cartesian.@nexprs $n_nodes_out i -> begin
526
Base.Cartesian.@nexprs $n_vars v -> begin
527
tmp1_v_i_j_k = zero(eltype(data_out))
528
Base.Cartesian.@nexprs $n_nodes_in ii -> begin
529
tmp1_v_i_j_k += vandermonde[i, ii] * data_in[v, ii, j, k]
530
end
531
end
532
end
533
end
534
end
535
536
# Interpolate in y-direction
537
@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_in k -> begin
538
Base.Cartesian.@nexprs $n_nodes_out j -> begin
539
Base.Cartesian.@nexprs $n_nodes_out i -> begin
540
Base.Cartesian.@nexprs $n_vars v -> begin
541
tmp2_v_i_j_k = zero(eltype(data_out))
542
Base.Cartesian.@nexprs $n_nodes_in jj -> begin
543
tmp2_v_i_j_k += vandermonde[j, jj] * tmp1_v_i_jj_k
544
end
545
end
546
end
547
end
548
end
549
550
# Interpolate in z-direction
551
@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_out k -> begin
552
Base.Cartesian.@nexprs $n_nodes_out j -> begin
553
Base.Cartesian.@nexprs $n_nodes_out i -> begin
554
Base.Cartesian.@nexprs $n_vars v -> begin
555
tmp3_v_i_j_k = zero(eltype(data_out))
556
Base.Cartesian.@nexprs $n_nodes_in kk -> begin
557
tmp3_v_i_j_k += vandermonde[k, kk] * tmp2_v_i_j_kk
558
end
559
data_out[v, i, j, k] = tmp3_v_i_j_k
560
end
561
end
562
end
563
end
564
565
return data_out
566
end
567
end
568
569
570
function multiply_dimensionwise_squeezed!(
571
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde)
572
n_vars = size(data_out, 1)
573
n_nodes_out = size(vandermonde, 1)
574
n_nodes_in = size(vandermonde, 2)
575
data_out .= zero(eltype(data_out))
576
577
@boundscheck begin
578
inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
579
(size(data_in, 1) == n_vars) &&
580
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
581
inbounds || throw(BoundsError())
582
end
583
584
@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
585
for v in 1:n_vars
586
acc = zero(eltype(data_out))
587
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
588
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
589
end
590
data_out[v, i, j, k] = acc
591
end
592
end
593
594
return data_out
595
end
596
597
function multiply_dimensionwise_squeezed_avx!(
598
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde)
599
n_vars = size(data_out, 1)
600
n_nodes_out = size(vandermonde, 1)
601
n_nodes_in = size(vandermonde, 2)
602
data_out .= zero(eltype(data_out))
603
604
@boundscheck begin
605
inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&
606
(size(data_in, 1) == n_vars) &&
607
(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)
608
inbounds || throw(BoundsError())
609
end
610
611
@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out
612
for v in 1:n_vars
613
acc = zero(eltype(data_out))
614
for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in
615
acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
616
end
617
data_out[v, i, j, k] = acc
618
end
619
end
620
621
return data_out
622
end
623
624
function multiply_dimensionwise_squeezed_tullio!(
625
data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde)
626
627
@tullio threads=false data_out[v, i, j, k] = vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]
628
629
return data_out
630
end
631
632
633
function run_benchmarks_3d(n_vars=4, n_nodes_in=4, n_nodes_out=2*n_nodes_in)
634
data_in = randn(n_vars, n_nodes_in, n_nodes_in, n_nodes_in)
635
data_out = randn(n_vars, n_nodes_out, n_nodes_out, n_nodes_out)
636
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
637
vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
638
639
println("\n\n# 3D ", "#"^70)
640
println("n_vars = ", n_vars)
641
println("n_nodes_in = ", n_nodes_in)
642
println("n_nodes_out = ", n_nodes_out)
643
println()
644
645
println("\n","multiply_dimensionwise_sequential!")
646
println("vandermonde_dynamic")
647
multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_dynamic)
648
data_out_copy = copy(data_out)
649
display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_dynamic)))
650
println("\n", "vandermonde_static")
651
multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_static)
652
@assert data_out data_out_copy
653
display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_static)))
654
println()
655
656
println("\n","multiply_dimensionwise_sequential_avx!")
657
println("vandermonde_dynamic")
658
multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_dynamic)
659
@assert data_out data_out_copy
660
display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))
661
println("\n", "vandermonde_static")
662
multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_static)
663
@assert data_out data_out_copy
664
display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_static)))
665
println()
666
667
println("\n","multiply_dimensionwise_sequential_tullio!")
668
println("vandermonde_dynamic")
669
multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_dynamic)
670
@assert data_out data_out_copy
671
display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))
672
println("\n", "vandermonde_static")
673
multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_static)
674
@assert data_out data_out_copy
675
display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static)))
676
println()
677
678
println("\n","multiply_dimensionwise_sequential_nexpr!")
679
println("vandermonde_static")
680
multiply_dimensionwise_sequential_nexpr!(data_out, data_in, vandermonde_static, Val(n_vars))
681
@assert data_out data_out_copy
682
display(@benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars))))
683
println()
684
685
println("\n","multiply_dimensionwise_squeezed!")
686
println("vandermonde_dynamic")
687
multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_dynamic)
688
@assert data_out data_out_copy
689
display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic)))
690
println("\n", "vandermonde_static")
691
multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_static)
692
@assert data_out data_out_copy
693
display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_static)))
694
println()
695
696
println("\n","multiply_dimensionwise_squeezed_avx!")
697
println("vandermonde_dynamic")
698
multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_dynamic)
699
@assert data_out data_out_copy
700
display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))
701
println("\n", "vandermonde_static")
702
multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_static)
703
@assert data_out data_out_copy
704
display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static)))
705
println()
706
707
println("\n","multiply_dimensionwise_squeezed_tullio!")
708
println("vandermonde_dynamic")
709
multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_dynamic)
710
@assert data_out data_out_copy
711
display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))
712
println("\n", "vandermonde_static")
713
multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_static)
714
@assert data_out data_out_copy
715
display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static)))
716
println()
717
718
nothing
719
end
720
721
# TODO
722
run_benchmarks_3d(5, 4, 4) # n_vars, n_nodes_in, n_nodes_out
723
724
725
function compute_benchmarks_3d(n_vars, n_nodes_in, n_nodes_out)
726
data_in = randn(n_vars, n_nodes_in, n_nodes_in, n_nodes_in)
727
data_out = randn(n_vars, n_nodes_out, n_nodes_out, n_nodes_out)
728
vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)
729
vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)
730
tmp1 = zeros(eltype(data_out), n_vars, n_nodes_out, n_nodes_in, n_nodes_in)
731
tmp2 = zeros(eltype(data_out), n_vars, n_nodes_out, n_nodes_out, n_nodes_in)
732
733
println("n_vars = ", n_vars, "; n_nodes_in = ", n_nodes_in, "; n_nodes_out = ", n_nodes_out)
734
sequential_dynamic = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))
735
sequential_static = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static))
736
#FIXME sequential_nexpr = @benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars)))
737
sequential_dynamic_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic), $(tmp1), $(tmp2))
738
sequential_static_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static), $(tmp1), $(tmp2))
739
squeezed_dynamic = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))
740
squeezed_static = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static))
741
742
return time(median(sequential_dynamic)),
743
time(median(sequential_static)),
744
NaN, #FIXME time(median(sequential_nexpr)),
745
time(median(sequential_dynamic_prealloc)),
746
time(median(sequential_static_prealloc)),
747
time(median(squeezed_dynamic)),
748
time(median(squeezed_static))
749
end
750
751
function compute_benchmarks_3d(n_vars_list, n_nodes_in_list)
752
sequential_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))
753
sequential_static = zeros(length(n_vars_list), length(n_nodes_in_list))
754
sequential_nexpr = zeros(length(n_vars_list), length(n_nodes_in_list))
755
sequential_dynamic_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))
756
sequential_static_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))
757
squeezed_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))
758
squeezed_static = zeros(length(n_vars_list), length(n_nodes_in_list))
759
760
# n_nodes_out = n_nodes_in
761
# mortar
762
# superset of n_vars = 1, n_nodes_out = n_nodes_in, used for blending
763
for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)
764
for (idx_variables, n_vars) in enumerate(n_vars_list)
765
n_nodes_out = n_nodes_in
766
sequential_dynamic[idx_variables, idx_nodes],
767
sequential_static[idx_variables, idx_nodes],
768
sequential_nexpr[idx_variables, idx_nodes],
769
sequential_dynamic_prealloc[idx_variables, idx_nodes],
770
sequential_static_prealloc[idx_variables, idx_nodes],
771
squeezed_dynamic[idx_variables, idx_nodes],
772
squeezed_static[idx_variables, idx_nodes] =
773
compute_benchmarks_3d(n_vars, n_nodes_in, n_nodes_out)
774
end
775
end
776
BSON.@save "3D_nVarTotal_nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static
777
778
# TODO deactivated to save some time
779
# # n_nodes_out = 2*n_nodes_in
780
# # visualization
781
# title = "n_vars = 2*n_vars, n_nodes_out = n_nodes_in"
782
# for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)
783
# for (idx_variables, n_vars) in enumerate(n_vars_list)
784
# n_nodes_out = 2 * n_nodes_in
785
# sequential_dynamic[idx_variables, idx_nodes],
786
# sequential_static[idx_variables, idx_nodes],
787
# sequential_nexpr[idx_variables, idx_nodes],
788
# sequential_dynamic_prealloc[idx_variables, idx_nodes],
789
# sequential_static_prealloc[idx_variables, idx_nodes],
790
# squeezed_dynamic[idx_variables, idx_nodes],
791
# squeezed_static[idx_variables, idx_nodes] =
792
# compute_benchmarks_3d(n_vars, n_nodes_in, n_nodes_out)
793
# end
794
# end
795
# BSON.@save "3D_nVarTotal_2nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static
796
797
return nothing
798
end
799
800
801
# TODO
802
compute_benchmarks_3d(1:10, 2:10)
803
804