Path: blob/main/benchmark/multiply_dimensionwise/benchmark_multiply_dimensionwise.jl
2055 views
# Disable formatting this file since it contains highly unusual formatting for better1# readability2#! format: off34import Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()56using BenchmarkTools7using BSON8using LoopVectorization9using MuladdMacro10using StaticArrays11using Tullio1213###################################################################################################14# 2D versions15function multiply_dimensionwise_sequential!(16data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde,17tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2)))18n_vars = size(data_out, 1)19n_nodes_out = size(vandermonde, 1)20n_nodes_in = size(vandermonde, 2)21data_out .= zero(eltype(data_out))2223@boundscheck begin24inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&25(size(data_in, 1) == n_vars) &&26(size(data_in, 2) == size(data_in, 3) == n_nodes_in)27inbounds || throw(BoundsError())28end2930# Interpolate in x-direction31@inbounds for j in 1:n_nodes_in, i in 1:n_nodes_out32for ii in 1:n_nodes_in33for v in 1:n_vars34tmp1[v, i, j] += vandermonde[i, ii] * data_in[v, ii, j]35end36end37end3839# Interpolate in y-direction40@inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out41for jj in 1:n_nodes_in42for v in 1:n_vars43data_out[v, i, j] += vandermonde[j, jj] * tmp1[v, i, jj]44end45end46end4748return data_out49end5051function multiply_dimensionwise_sequential_avx!(52data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde,53tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2)))54n_vars = size(data_out, 1)55n_nodes_out = size(vandermonde, 1)56n_nodes_in = size(vandermonde, 2)57data_out .= zero(eltype(data_out))5859@boundscheck begin60inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&61(size(data_in, 1) == n_vars) &&62(size(data_in, 2) == size(data_in, 3) == n_nodes_in)63inbounds || throw(BoundsError())64end6566# Interpolate in x-direction67@avx for j in 1:n_nodes_in, i in 1:n_nodes_out68for ii in 1:n_nodes_in69for v in 1:n_vars70tmp1[v, i, j] += vandermonde[i, ii] * data_in[v, ii, j]71end72end73end7475# Interpolate in y-direction76@avx for j in 1:n_nodes_out, i in 1:n_nodes_out77for jj in 1:n_nodes_in78for v in 1:n_vars79data_out[v, i, j] += vandermonde[j, jj] * tmp1[v, i, jj]80end81end82end8384return data_out85end8687function multiply_dimensionwise_sequential_tullio!(88data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde,89tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2)))9091# Interpolate in x-direction92@tullio threads=false tmp1[v, i, j] = vandermonde[i, ii] * data_in[v, ii, j]9394# Interpolate in y-direction95@tullio threads=false data_out[v, i, j] = vandermonde[j, jj] * tmp1[v, i, jj]9697return data_out98end99100@generated function multiply_dimensionwise_sequential_nexpr!(101data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde::SMatrix{n_nodes_out,n_nodes_in}, ::Val{n_vars}) where {n_nodes_out, n_nodes_in, n_vars}102quote103@boundscheck begin104inbounds = (size(data_out, 1) == $n_vars) &&105(size(data_out, 2) == size(data_out, 3) == $n_nodes_out) &&106(size(data_in, 1) == $n_vars) &&107(size(data_in, 2) == size(data_in, 3) == $n_nodes_in)108inbounds || throw(BoundsError())109end110111# Interpolate in x-direction112@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_in j -> begin113Base.Cartesian.@nexprs $n_nodes_out i -> begin114Base.Cartesian.@nexprs $n_vars v -> begin115tmp1_v_i_j = zero(eltype(data_out))116Base.Cartesian.@nexprs $n_nodes_in ii -> begin117tmp1_v_i_j += vandermonde[i, ii] * data_in[v, ii, j]118end119end120end121end122123# Interpolate in y-direction124@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_out j -> begin125Base.Cartesian.@nexprs $n_nodes_out i -> begin126Base.Cartesian.@nexprs $n_vars v -> begin127tmp2_v_i_j = zero(eltype(data_out))128Base.Cartesian.@nexprs $n_nodes_in jj -> begin129tmp2_v_i_j += vandermonde[j, jj] * tmp1_v_i_jj130end131data_out[v, i, j] = tmp2_v_i_j132end133end134end135136return data_out137end138end139140141function multiply_dimensionwise_squeezed!(142data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde)143n_vars = size(data_out, 1)144n_nodes_out = size(vandermonde, 1)145n_nodes_in = size(vandermonde, 2)146data_out .= zero(eltype(data_out))147148@boundscheck begin149inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&150(size(data_in, 1) == n_vars) &&151(size(data_in, 2) == size(data_in, 3) == n_nodes_in)152inbounds || throw(BoundsError())153end154155@inbounds for j in 1:n_nodes_out, i in 1:n_nodes_out156for v in 1:n_vars157acc = zero(eltype(data_out))158for jj in 1:n_nodes_in, ii in 1:n_nodes_in159acc += vandermonde[i, ii]* vandermonde[j, jj] * data_in[v, ii, jj]160end161data_out[v, i, j] = acc162end163end164165return data_out166end167168function multiply_dimensionwise_squeezed_avx!(169data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde)170n_vars = size(data_out, 1)171n_nodes_out = size(vandermonde, 1)172n_nodes_in = size(vandermonde, 2)173data_out .= zero(eltype(data_out))174175@boundscheck begin176inbounds = (size(data_out, 2) == size(data_out, 3) == n_nodes_out) &&177(size(data_in, 1) == n_vars) &&178(size(data_in, 2) == size(data_in, 3) == n_nodes_in)179inbounds || throw(BoundsError())180end181182@avx for j in 1:n_nodes_out, i in 1:n_nodes_out183for v in 1:n_vars184acc = zero(eltype(data_out))185for jj in 1:n_nodes_in, ii in 1:n_nodes_in186acc += vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]187end188data_out[v, i, j] = acc189end190end191192return data_out193end194195function multiply_dimensionwise_squeezed_tullio!(196data_out::AbstractArray{<:Any, 3}, data_in::AbstractArray{<:Any, 3}, vandermonde)197198@tullio threads=false data_out[v, i, j] = vandermonde[i, ii] * vandermonde[j, jj] * data_in[v, ii, jj]199200return data_out201end202203204function run_benchmarks_2d(n_vars=4, n_nodes_in=4, n_nodes_out=2*n_nodes_in)205data_in = randn(n_vars, n_nodes_in, n_nodes_in)206data_out = randn(n_vars, n_nodes_out, n_nodes_out)207vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)208vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)209vandermonde_mmatrix = MMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)210211println("\n\n# 2D ", "#"^70)212println("n_vars = ", n_vars)213println("n_nodes_in = ", n_nodes_in)214println("n_nodes_out = ", n_nodes_out)215println()216217println("\n","multiply_dimensionwise_sequential!")218println("vandermonde_dynamic")219multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_dynamic)220data_out_copy = copy(data_out)221display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_dynamic)))222println("\n", "vandermonde_static")223multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_static)224@assert data_out ≈ data_out_copy225display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_static)))226println("\n", "vandermonde_mmatrix")227multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_mmatrix)228@assert data_out ≈ data_out_copy229display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_mmatrix)))230println()231232println("\n","multiply_dimensionwise_sequential_avx!")233println("vandermonde_dynamic")234multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_dynamic)235@assert data_out ≈ data_out_copy236display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))237println("\n", "vandermonde_static")238multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_static)239@assert data_out ≈ data_out_copy240display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_static)))241println("\n", "vandermonde_mmatrix")242multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_mmatrix)243@assert data_out ≈ data_out_copy244display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_mmatrix)))245println()246247println("\n","multiply_dimensionwise_sequential_tullio!")248println("vandermonde_dynamic")249multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_dynamic)250@assert data_out ≈ data_out_copy251display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))252println("\n", "vandermonde_static")253multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_static)254@assert data_out ≈ data_out_copy255display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static)))256println("\n", "vandermonde_mmatrix")257multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_mmatrix)258@assert data_out ≈ data_out_copy259display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_mmatrix)))260println()261262println("\n","multiply_dimensionwise_sequential_nexpr!")263println("vandermonde_static")264multiply_dimensionwise_sequential_nexpr!(data_out, data_in, vandermonde_static, Val(n_vars))265@assert data_out ≈ data_out_copy266display(@benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars))))267println()268269270println("\n","multiply_dimensionwise_squeezed!")271println("vandermonde_dynamic")272multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_dynamic)273@assert data_out ≈ data_out_copy274display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic)))275println("\n", "vandermonde_static")276multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_static)277@assert data_out ≈ data_out_copy278display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_static)))279println("\n", "vandermonde_mmatrix")280multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_mmatrix)281@assert data_out ≈ data_out_copy282display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_mmatrix)))283println()284285println("\n","multiply_dimensionwise_squeezed_avx!")286println("vandermonde_dynamic")287multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_dynamic)288@assert data_out ≈ data_out_copy289display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))290println("\n", "vandermonde_static")291multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_static)292@assert data_out ≈ data_out_copy293display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static)))294println("\n", "vandermonde_mmatrix")295multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_mmatrix)296@assert data_out ≈ data_out_copy297display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_mmatrix)))298println()299300println("\n","multiply_dimensionwise_squeezed_tullio!")301println("vandermonde_dynamic")302multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_dynamic)303@assert data_out ≈ data_out_copy304display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))305println("\n", "vandermonde_static")306multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_static)307@assert data_out ≈ data_out_copy308display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static)))309println("\n", "vandermonde_mmatrix")310multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_mmatrix)311@assert data_out ≈ data_out_copy312display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_mmatrix)))313println()314315nothing316end317318# TODO319run_benchmarks_2d(4, 4, 4) # n_vars, n_nodes_in, n_nodes_out320321322function compute_benchmarks_2d(n_vars, n_nodes_in, n_nodes_out)323data_in = randn(n_vars, n_nodes_in, n_nodes_in)324data_out = randn(n_vars, n_nodes_out, n_nodes_out)325vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)326vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)327tmp1 = zeros(eltype(data_out), n_vars, n_nodes_out, n_nodes_in)328329println("n_vars = ", n_vars, "; n_nodes_in = ", n_nodes_in, "; n_nodes_out = ", n_nodes_out)330sequential_dynamic = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))331sequential_static = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static))332#FIXME sequential_nexpr = @benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars)))333sequential_dynamic_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic), $(tmp1))334sequential_static_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static), $(tmp1))335squeezed_dynamic = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))336squeezed_static = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static))337338return time(median(sequential_dynamic)),339time(median(sequential_static)),340NaN, #FIXME time(median(sequential_nexpr)),341time(median(sequential_dynamic_prealloc)),342time(median(sequential_static_prealloc)),343time(median(squeezed_dynamic)),344time(median(squeezed_static))345end346347function compute_benchmarks_2d(n_vars_list, n_nodes_in_list)348sequential_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))349sequential_static = zeros(length(n_vars_list), length(n_nodes_in_list))350sequential_nexpr = zeros(length(n_vars_list), length(n_nodes_in_list))351sequential_dynamic_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))352sequential_static_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))353squeezed_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))354squeezed_static = zeros(length(n_vars_list), length(n_nodes_in_list))355356# n_nodes_out = n_nodes_in357# mortar358# superset of n_vars = 1, n_nodes_out = n_nodes_in, used for blending359for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)360for (idx_variables, n_vars) in enumerate(n_vars_list)361n_nodes_out = n_nodes_in362sequential_dynamic[idx_variables, idx_nodes],363sequential_static[idx_variables, idx_nodes],364sequential_nexpr[idx_variables, idx_nodes],365sequential_dynamic_prealloc[idx_variables, idx_nodes],366sequential_static_prealloc[idx_variables, idx_nodes],367squeezed_dynamic[idx_variables, idx_nodes],368squeezed_static[idx_variables, idx_nodes] =369compute_benchmarks_2d(n_vars, n_nodes_in, n_nodes_out)370end371end372BSON.@save "2D_nVarTotal_nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static373374# n_nodes_out = 2*n_nodes_in375# visualization376for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)377for (idx_variables, n_vars) in enumerate(n_vars_list)378n_nodes_out = 2 * n_nodes_in379sequential_dynamic[idx_variables, idx_nodes],380sequential_static[idx_variables, idx_nodes],381sequential_nexpr[idx_variables, idx_nodes],382sequential_dynamic_prealloc[idx_variables, idx_nodes],383sequential_static_prealloc[idx_variables, idx_nodes],384squeezed_dynamic[idx_variables, idx_nodes],385squeezed_static[idx_variables, idx_nodes] =386compute_benchmarks_2d(n_vars, n_nodes_in, n_nodes_out)387end388end389BSON.@save "2D_nVarTotal_2nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static390391return nothing392end393394# TODO395compute_benchmarks_2d(1:10, 2:10)396397398###################################################################################################399# 3D versions400401function multiply_dimensionwise_sequential!(402data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde,403tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),404tmp2=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2)))405n_vars = size(data_out, 1)406n_nodes_out = size(vandermonde, 1)407n_nodes_in = size(vandermonde, 2)408data_out .= zero(eltype(data_out))409410@boundscheck begin411inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&412(size(data_in, 1) == n_vars) &&413(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)414inbounds || throw(BoundsError())415end416417# Interpolate in x-direction418@inbounds for k in 1:n_nodes_in, j in 1:n_nodes_in, i in 1:n_nodes_out419for ii in 1:n_nodes_in420for v in 1:n_vars421tmp1[v, i, j, k] += vandermonde[i, ii] * data_in[v, ii, j, k]422end423end424end425426# Interpolate in y-direction427@inbounds for k in 1:n_nodes_in, j in 1:n_nodes_out, i in 1:n_nodes_out428for jj in 1:n_nodes_in429for v in 1:n_vars430tmp2[v, i, j, k] += vandermonde[j, jj] * tmp1[v, i, jj, k]431end432end433end434435# Interpolate in z-direction436@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out437for kk in 1:n_nodes_in438for v in 1:n_vars439data_out[v, i, j, k] += vandermonde[k, kk] * tmp2[v, i, j, kk]440end441end442end443444return data_out445end446447function multiply_dimensionwise_sequential_avx!(448data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde,449tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),450tmp2=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2)))451n_vars = size(data_out, 1)452n_nodes_out = size(vandermonde, 1)453n_nodes_in = size(vandermonde, 2)454data_out .= zero(eltype(data_out))455456@boundscheck begin457inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&458(size(data_in, 1) == n_vars) &&459(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)460inbounds || throw(BoundsError())461end462463# Interpolate in x-direction464@avx for k in 1:n_nodes_in, j in 1:n_nodes_in, i in 1:n_nodes_out465for ii in 1:n_nodes_in466for v in 1:n_vars467tmp1[v, i, j, k] += vandermonde[i, ii] * data_in[v, ii, j, k]468end469end470end471472# Interpolate in y-direction473@avx for k in 1:n_nodes_in, j in 1:n_nodes_out, i in 1:n_nodes_out474for jj in 1:n_nodes_in475for v in 1:n_vars476tmp2[v, i, j, k] += vandermonde[j, jj] * tmp1[v, i, jj, k]477end478end479end480481# Interpolate in z-direction482@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out483for kk in 1:n_nodes_in484for v in 1:n_vars485data_out[v, i, j, k] += vandermonde[k, kk] * tmp2[v, i, j, kk]486end487end488end489490return data_out491end492493function multiply_dimensionwise_sequential_tullio!(494data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde,495tmp1=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 2), size(vandermonde, 2)),496tmp2=zeros(eltype(data_out), size(data_out, 1), size(vandermonde, 1), size(vandermonde, 1), size(vandermonde, 2)))497498# Interpolate in x-direction499@tullio threads=false tmp1[v, i, j, k] = vandermonde[i, ii] * data_in[v, ii, j, k]500501# Interpolate in y-direction502@tullio threads=false tmp2[v, i, j, k] = vandermonde[j, jj] * tmp1[v, i, jj, k]503504# Interpolate in z-direction505@tullio threads=false data_out[v, i, j, k] = vandermonde[k, kk] * tmp2[v, i, j, kk]506507return data_out508end509510@generated function multiply_dimensionwise_sequential_nexpr!(511data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde::SMatrix{n_nodes_out,n_nodes_in}, ::Val{n_vars}) where {n_nodes_out, n_nodes_in, n_vars}512quote513@boundscheck begin514inbounds = (size(data_out, 1) == $n_vars) &&515(size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == $n_nodes_out) &&516(size(data_in, 1) == $n_vars) &&517(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == $n_nodes_in)518inbounds || throw(BoundsError())519end520521# Interpolate in x-direction522@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_in k -> begin523Base.Cartesian.@nexprs $n_nodes_in j -> begin524Base.Cartesian.@nexprs $n_nodes_out i -> begin525Base.Cartesian.@nexprs $n_vars v -> begin526tmp1_v_i_j_k = zero(eltype(data_out))527Base.Cartesian.@nexprs $n_nodes_in ii -> begin528tmp1_v_i_j_k += vandermonde[i, ii] * data_in[v, ii, j, k]529end530end531end532end533end534535# Interpolate in y-direction536@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_in k -> begin537Base.Cartesian.@nexprs $n_nodes_out j -> begin538Base.Cartesian.@nexprs $n_nodes_out i -> begin539Base.Cartesian.@nexprs $n_vars v -> begin540tmp2_v_i_j_k = zero(eltype(data_out))541Base.Cartesian.@nexprs $n_nodes_in jj -> begin542tmp2_v_i_j_k += vandermonde[j, jj] * tmp1_v_i_jj_k543end544end545end546end547end548549# Interpolate in z-direction550@inbounds @muladd Base.Cartesian.@nexprs $n_nodes_out k -> begin551Base.Cartesian.@nexprs $n_nodes_out j -> begin552Base.Cartesian.@nexprs $n_nodes_out i -> begin553Base.Cartesian.@nexprs $n_vars v -> begin554tmp3_v_i_j_k = zero(eltype(data_out))555Base.Cartesian.@nexprs $n_nodes_in kk -> begin556tmp3_v_i_j_k += vandermonde[k, kk] * tmp2_v_i_j_kk557end558data_out[v, i, j, k] = tmp3_v_i_j_k559end560end561end562end563564return data_out565end566end567568569function multiply_dimensionwise_squeezed!(570data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde)571n_vars = size(data_out, 1)572n_nodes_out = size(vandermonde, 1)573n_nodes_in = size(vandermonde, 2)574data_out .= zero(eltype(data_out))575576@boundscheck begin577inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&578(size(data_in, 1) == n_vars) &&579(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)580inbounds || throw(BoundsError())581end582583@inbounds for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out584for v in 1:n_vars585acc = zero(eltype(data_out))586for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in587acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]588end589data_out[v, i, j, k] = acc590end591end592593return data_out594end595596function multiply_dimensionwise_squeezed_avx!(597data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde)598n_vars = size(data_out, 1)599n_nodes_out = size(vandermonde, 1)600n_nodes_in = size(vandermonde, 2)601data_out .= zero(eltype(data_out))602603@boundscheck begin604inbounds = (size(data_out, 2) == size(data_out, 3) == size(data_out, 4) == n_nodes_out) &&605(size(data_in, 1) == n_vars) &&606(size(data_in, 2) == size(data_in, 3) == size(data_in, 4) == n_nodes_in)607inbounds || throw(BoundsError())608end609610@avx for k in 1:n_nodes_out, j in 1:n_nodes_out, i in 1:n_nodes_out611for v in 1:n_vars612acc = zero(eltype(data_out))613for kk in 1:n_nodes_in, jj in 1:n_nodes_in, ii in 1:n_nodes_in614acc += vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]615end616data_out[v, i, j, k] = acc617end618end619620return data_out621end622623function multiply_dimensionwise_squeezed_tullio!(624data_out::AbstractArray{<:Any, 4}, data_in::AbstractArray{<:Any, 4}, vandermonde)625626@tullio threads=false data_out[v, i, j, k] = vandermonde[i, ii] * vandermonde[j, jj] * vandermonde[k, kk] * data_in[v, ii, jj, kk]627628return data_out629end630631632function run_benchmarks_3d(n_vars=4, n_nodes_in=4, n_nodes_out=2*n_nodes_in)633data_in = randn(n_vars, n_nodes_in, n_nodes_in, n_nodes_in)634data_out = randn(n_vars, n_nodes_out, n_nodes_out, n_nodes_out)635vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)636vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)637638println("\n\n# 3D ", "#"^70)639println("n_vars = ", n_vars)640println("n_nodes_in = ", n_nodes_in)641println("n_nodes_out = ", n_nodes_out)642println()643644println("\n","multiply_dimensionwise_sequential!")645println("vandermonde_dynamic")646multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_dynamic)647data_out_copy = copy(data_out)648display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_dynamic)))649println("\n", "vandermonde_static")650multiply_dimensionwise_sequential!(data_out, data_in, vandermonde_static)651@assert data_out ≈ data_out_copy652display(@benchmark multiply_dimensionwise_sequential!($(data_out), $(data_in), $(vandermonde_static)))653println()654655println("\n","multiply_dimensionwise_sequential_avx!")656println("vandermonde_dynamic")657multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_dynamic)658@assert data_out ≈ data_out_copy659display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))660println("\n", "vandermonde_static")661multiply_dimensionwise_sequential_avx!(data_out, data_in, vandermonde_static)662@assert data_out ≈ data_out_copy663display(@benchmark multiply_dimensionwise_sequential_avx!($(data_out), $(data_in), $(vandermonde_static)))664println()665666println("\n","multiply_dimensionwise_sequential_tullio!")667println("vandermonde_dynamic")668multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_dynamic)669@assert data_out ≈ data_out_copy670display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))671println("\n", "vandermonde_static")672multiply_dimensionwise_sequential_tullio!(data_out, data_in, vandermonde_static)673@assert data_out ≈ data_out_copy674display(@benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static)))675println()676677println("\n","multiply_dimensionwise_sequential_nexpr!")678println("vandermonde_static")679multiply_dimensionwise_sequential_nexpr!(data_out, data_in, vandermonde_static, Val(n_vars))680@assert data_out ≈ data_out_copy681display(@benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars))))682println()683684println("\n","multiply_dimensionwise_squeezed!")685println("vandermonde_dynamic")686multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_dynamic)687@assert data_out ≈ data_out_copy688display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_dynamic)))689println("\n", "vandermonde_static")690multiply_dimensionwise_squeezed!(data_out, data_in, vandermonde_static)691@assert data_out ≈ data_out_copy692display(@benchmark multiply_dimensionwise_squeezed!($(data_out), $(data_in), $(vandermonde_static)))693println()694695println("\n","multiply_dimensionwise_squeezed_avx!")696println("vandermonde_dynamic")697multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_dynamic)698@assert data_out ≈ data_out_copy699display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_dynamic)))700println("\n", "vandermonde_static")701multiply_dimensionwise_squeezed_avx!(data_out, data_in, vandermonde_static)702@assert data_out ≈ data_out_copy703display(@benchmark multiply_dimensionwise_squeezed_avx!($(data_out), $(data_in), $(vandermonde_static)))704println()705706println("\n","multiply_dimensionwise_squeezed_tullio!")707println("vandermonde_dynamic")708multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_dynamic)709@assert data_out ≈ data_out_copy710display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic)))711println("\n", "vandermonde_static")712multiply_dimensionwise_squeezed_tullio!(data_out, data_in, vandermonde_static)713@assert data_out ≈ data_out_copy714display(@benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static)))715println()716717nothing718end719720# TODO721run_benchmarks_3d(5, 4, 4) # n_vars, n_nodes_in, n_nodes_out722723724function compute_benchmarks_3d(n_vars, n_nodes_in, n_nodes_out)725data_in = randn(n_vars, n_nodes_in, n_nodes_in, n_nodes_in)726data_out = randn(n_vars, n_nodes_out, n_nodes_out, n_nodes_out)727vandermonde_dynamic = randn(n_nodes_out, n_nodes_in)728vandermonde_static = SMatrix{n_nodes_out, n_nodes_in}(vandermonde_dynamic)729tmp1 = zeros(eltype(data_out), n_vars, n_nodes_out, n_nodes_in, n_nodes_in)730tmp2 = zeros(eltype(data_out), n_vars, n_nodes_out, n_nodes_out, n_nodes_in)731732println("n_vars = ", n_vars, "; n_nodes_in = ", n_nodes_in, "; n_nodes_out = ", n_nodes_out)733sequential_dynamic = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))734sequential_static = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static))735#FIXME sequential_nexpr = @benchmark multiply_dimensionwise_sequential_nexpr!($(data_out), $(data_in), $(vandermonde_static), $(Val(n_vars)))736sequential_dynamic_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_dynamic), $(tmp1), $(tmp2))737sequential_static_prealloc = @benchmark multiply_dimensionwise_sequential_tullio!($(data_out), $(data_in), $(vandermonde_static), $(tmp1), $(tmp2))738squeezed_dynamic = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_dynamic))739squeezed_static = @benchmark multiply_dimensionwise_squeezed_tullio!($(data_out), $(data_in), $(vandermonde_static))740741return time(median(sequential_dynamic)),742time(median(sequential_static)),743NaN, #FIXME time(median(sequential_nexpr)),744time(median(sequential_dynamic_prealloc)),745time(median(sequential_static_prealloc)),746time(median(squeezed_dynamic)),747time(median(squeezed_static))748end749750function compute_benchmarks_3d(n_vars_list, n_nodes_in_list)751sequential_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))752sequential_static = zeros(length(n_vars_list), length(n_nodes_in_list))753sequential_nexpr = zeros(length(n_vars_list), length(n_nodes_in_list))754sequential_dynamic_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))755sequential_static_prealloc = zeros(length(n_vars_list), length(n_nodes_in_list))756squeezed_dynamic = zeros(length(n_vars_list), length(n_nodes_in_list))757squeezed_static = zeros(length(n_vars_list), length(n_nodes_in_list))758759# n_nodes_out = n_nodes_in760# mortar761# superset of n_vars = 1, n_nodes_out = n_nodes_in, used for blending762for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)763for (idx_variables, n_vars) in enumerate(n_vars_list)764n_nodes_out = n_nodes_in765sequential_dynamic[idx_variables, idx_nodes],766sequential_static[idx_variables, idx_nodes],767sequential_nexpr[idx_variables, idx_nodes],768sequential_dynamic_prealloc[idx_variables, idx_nodes],769sequential_static_prealloc[idx_variables, idx_nodes],770squeezed_dynamic[idx_variables, idx_nodes],771squeezed_static[idx_variables, idx_nodes] =772compute_benchmarks_3d(n_vars, n_nodes_in, n_nodes_out)773end774end775BSON.@save "3D_nVarTotal_nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static776777# TODO deactivated to save some time778# # n_nodes_out = 2*n_nodes_in779# # visualization780# title = "n_vars = 2*n_vars, n_nodes_out = n_nodes_in"781# for (idx_nodes, n_nodes_in) in enumerate(n_nodes_in_list)782# for (idx_variables, n_vars) in enumerate(n_vars_list)783# n_nodes_out = 2 * n_nodes_in784# sequential_dynamic[idx_variables, idx_nodes],785# sequential_static[idx_variables, idx_nodes],786# sequential_nexpr[idx_variables, idx_nodes],787# sequential_dynamic_prealloc[idx_variables, idx_nodes],788# sequential_static_prealloc[idx_variables, idx_nodes],789# squeezed_dynamic[idx_variables, idx_nodes],790# squeezed_static[idx_variables, idx_nodes] =791# compute_benchmarks_3d(n_vars, n_nodes_in, n_nodes_out)792# end793# end794# BSON.@save "3D_nVarTotal_2nNodesIn.bson" n_vars_list n_nodes_in_list sequential_dynamic sequential_static sequential_nexpr sequential_dynamic_prealloc sequential_static_prealloc squeezed_dynamic squeezed_static795796return nothing797end798799800# TODO801compute_benchmarks_3d(1:10, 2:10)802803804