# 一段模拟扩散的代码，目前的速度大概只有matlab版本的七成，恳请进一步提高性能。

``````using VectorizedRoutines, Random, FFTW, Printf, MAT

function f_def_field(itype)
if itype == 1
f_def = @. zeros(N[1], N[2], N[3]) + 0.05;
elseif itype == 2
angle1, λ_1, A_1, C_1 = + π / 12, 256 * δx, 0.05, 0.0;
angle2, λ_2, A_2, C_2 = - π / 12, 64  * δx, 0.05, 0.0;
xx = δx .* (1:N[2])';
f_def1 = zeros(N[1], N[2], N[3]);
f_def2 = zeros(N[1], N[2], N[3]);
for i in 1:N[1], j in 1:N[2]
f_def1[i,j,1] =  A_1 * (1 + sin(2π * (xx[i]/λ_1 - xx[j]/λ_2))) + C_1;
f_def2[i,j,1] =  A_2 * (1 + sin(2π * (xx[i]/λ_1 + xx[j]/λ_2))) + C_2;
end
f_def = zeros(N[1], N[2], N[3]);
for i in 1:N[1], j in 1:N[2], k in 1:N[3]
f_def[i,j,k] = max(f_def1[i,j,k], f_def2[i,j,k]);
end
end
return f_def
end

function main!(n, η_rex, η_def, sumetasqu, f_def_force)
start_time = time();

#    pre-allocations
η_rex_fft = zeros(Complex{Float64}, N[1], N[2], N[3]);
η_def_fft = zeros(Complex{Float64}, N[1], N[2], N[3]);
tem       = zeros(Complex{Float64}, N[1], N[2], N[3]);

for k = 1:n
η_rex_fft, η_def_fft = fft(η_rex), fft(η_def);
tem = fft(@. -4 / 3 * M / l_gb *
(6σ_gb / l_gb * (η_rex^3 - η_rex + 3η_rex * η_def^2) -
2η_rex * η_def^2 / sumetasqu^2 * f_def_force));
η_rex = real(ifft(@. η_rex_fft + δt * (-M * σ_gb * k2 * η_rex_fft + tem)));

tem = fft(@. -4 / 3 * M / l_gb *
(6σ_gb / l_gb * (η_def^3 - η_def + 3η_def * η_rex^2) +
2η_def * η_rex^2 / sumetasqu^2 * f_def_force));
η_def = real(ifft(@. η_def_fft + δt * (-M * σ_gb * k2 * η_def_fft + tem)));

sumetasqu = @. η_rex^2 + η_def^2;
f_def_force = @. f_def * (η_def^2 / sumetasqu);

if mod(k, 10) == 0
println("iterationstep  ", k)
end
end
end

## 主程序
const n = 100;
const N    = [1024  1024  1];
const M    = 1.;
const v_gb = 1.;
const δx   = 1.;
const δt   = 0.05;
l_gb = sqrt(9.6) * δx;
const σ_gb = 0.32;

f_def = f_def_field(2);
f_def_force = f_def;

η_rex = zeros(N[1], N[2], N[3]);
η_rex[1:5, :, :] .= 1.;
η_def = @. 1. - η_rex;
sumetasqu = @. η_rex^2 + η_def^2;
ϕ_def = @. η_def^2 / sumetasqu;
f_def_force  = @. f_def_force * ϕ_def;
g   = @. 2π / N[1] / δx * [ 0:N[1] / 2; -N[1] / 2 + 1:-1 ];
gg  = @. 2π / N[2] / δx * [ 0:N[2] / 2; -N[2] / 2 + 1:-1 ];
ggg = @. 2π / N[3] / δx * [ 0:N[3] / 2; -N[3] / 2 + 1:-1 ];
g1, g2, g3 = Matlab.meshgrid(g, gg, ggg);
k2 = @. g1^2 + g2^2 + g3^2;

start_time = time();
main!(n, η_rex, η_def, sumetasqu, f_def_force)
println("time used  ", time() - start_time)
``````

`main!` 中的循环里都是新分配内存，这样很慢，能改成预分配内存，然后就地修改(in-place)就能更快一些。

``````  37.720 s (39000 allocations: 23.44 GiB)

26.686 s (26848 allocations: 146.32 MiB)
``````

• `f_def_force = f_def;` 这一句在julia里就是取别名，两个符号指向同一块内存。
我这里分开了。f_def 的值一直保持不变。
• 主循环中的代码时对称的，上下两块只是交换了 η_rex 与 η_def

``````using VectorizedRoutines
using FFTW
using BenchmarkTools

"""计算参数（不可变）

mutable struct _StructName
# ...
end

"""
struct Params
# 迭代次数
iter_times :: Int64
# 各方向维度
dim :: NTuple{3, Int64}

M       :: Float64
# not use
v_gb    :: Float64
δx      :: Float64
δt      :: Float64
l_gb    :: Float64
σ_gb    :: Float64

k2 :: Array{Float64,3}
f_def :: Array{Float64,3}
end

function f_def_field(itype, dim, δx)
nx, ny, nz = dim
f_def = zeros(dim)

if itype == 1
f_def = @. f_def + 0.05
elseif itype == 2
angle1, λ_1, A_1, C_1 = + π / 12, 256 * δx, 0.05, 0.0
angle2, λ_2, A_2, C_2 = - π / 12, 64  * δx, 0.05, 0.0
xx = δx .* (1:ny)'
f_def1 = zeros(dim)
f_def2 = zeros(dim)
for i in 1:nx, j in 1:ny
f_def1[i,j,1] =  A_1 * (1 + sin(2π * (xx[i]/λ_1 - xx[j]/λ_2))) + C_1
f_def2[i,j,1] =  A_2 * (1 + sin(2π * (xx[i]/λ_1 + xx[j]/λ_2))) + C_2
end
for i in 1:nx, j in 1:ny, k in 1:nz
f_def[i,j,k] = max(f_def1[i,j,k], f_def2[i,j,k])
end
end

f_def
end

"""初始化参数

+ iter_times: 迭代次数
+ dim: 空间维度
"""
function init_params(iter_times = 100, dim = (512, 512, 1))
n    = iter_times
nx, ny, nz = dim
M    = 1.
v_gb = 1.
δx   = 1.
δt   = 0.05
l_gb = sqrt(9.6) * δx
σ_gb = 0.32

f_def = f_def_field(2, dim, δx)
f_def_force = f_def

η_rex = zeros(dim)
η_rex[1:5, :, :] .= 1.
η_def = @. 1. - η_rex
sumetasqu = @. η_rex^2 + η_def^2
ϕ_def = @. η_def^2 / sumetasqu
f_def_force  = @. f_def_force * ϕ_def
g   = @. 2π / nx / δx * [ 0:nx / 2; -nx / 2 + 1:-1 ]
gg  = @. 2π / ny / δx * [ 0:ny / 2; -ny / 2 + 1:-1 ]
ggg = @. 2π / nz / δx * [ 0:nz / 2; -nz / 2 + 1:-1 ]
g1, g2, g3 = Matlab.meshgrid(g, gg, ggg)
k2 = @. g1^2 + g2^2 + g3^2

(
η_rex, η_def, sumetasqu, f_def_force,
Params(iter_times, dim, M, v_gb, δx,δt, l_gb,σ_gb, k2, f_def)
)
end

#= 主要的计算函数 =#

help_kernel1!(tmp::Array{T2, N},
η1::Array{T1, N}, η2::Array{T1, N},
sumetasqu::Array{T1, N}, f_def_force::Array{T1, N},
p::Params,
) where {T1<:Real, T2<:Complex, N} =
@. tmp = -4/3 * p.M / p.l_gb *
(6 * p.σ_gb/p.l_gb * (η1^3 - η1 + 3*η1*η2^2) -
2 * η1 * η2^2 / sumetasqu^2 * f_def_force)

help_kernel2!(tmp::Array{T, N},
cη1::Array{T, N}, cη2::Array{T, N},
p::Params,
) where {T<:Complex, N} =
@. tmp = cη1 + p.δt * (-p.M * p.σ_gb * p.k2 * cη2 + tmp)

"复制 src 的实数部分到 des"
copyre!(des::Array{T, N}, src::Array{Complex{T}, N}) where {T<:Real, N} =
map!(real, des, src)

function help_kernel0!(tmp::Array{T2, N},
η1::Array{T1, N}, cη1::Array{T2, N},
η2::Array{T1, N}, cη2::Array{T2, N},
sumetasqu::Array{T1, N},
f_def_force::Array{T1, N},
p :: Params,
) where {T1<:Real, T2<:Complex, N}
# 计算 tmp
help_kernel1!(tmp, η1, η2, sumetasqu, f_def_force, p)
fft!(tmp)

# 计算 cη1 和 cη2 的 fft
fft!(cη1)
fft!(cη2)

# 计算 ifft, 复制实数部分到 η1, η2
help_kernel2!(tmp, cη1, cη2, p)
ifft!(tmp)
copyre!(η1, cη1)
copyre!(η2, cη2)
end

"for 循环中的主体函数"
function main_kernel!(tmp::Array{T2, N},
η_rex::Array{T1, N}, cη_rex::Array{T2, N},
η_def::Array{T1, N}, cη_def::Array{T2, N},
sumetasqu::Array{T1, N}, f_def_force::Array{T1, N},
p :: Params,
) where {T1<:Real, T2<:Complex, N}
help_kernel0!(tmp, η_rex, cη_rex, η_def, cη_def, sumetasqu, f_def_force, p)
help_kernel0!(tmp, η_def, cη_def, η_rex, cη_rex, sumetasqu, f_def_force, p)

@. sumetasqu = η_rex^2 + η_def^2
@. f_def_force = p.f_def * (η_def^2 / sumetasqu)
end

"主函数"
function main(iter_times = 100, dim = (512, 512, 1))
η_rex, η_def, sumetasqu, f_def_force, p = init_params(iter_times, dim)
cη_rex, cη_def = complex.([η_rex, η_def])
tmp = similar(cη_rex)
for i in 1:p.iter_times
main_kernel!(tmp, η_rex, cη_rex, η_def, cη_def, sumetasqu, f_def_force, p)
end
η_rex, η_def, sumetasqu, f_def_force
end

# 运行主函数
@btime main(100, (1024, 1024, 1))
# main()
``````

1赞

julia 还有一个类型稳定的问题，这个用 `@code_warntype` 看一看，慢慢就有感觉了。

``````@btime main(100, (1024, 1024, 1))
29.363 s (8238 allocations: 264.55 MiB)
@btime main(100, (2048, 2048, 1))
108.976 s (8642 allocations: 1.03 GiB)
@btime main(1000, (1024, 1024, 1))
264.443 s (81491 allocations: 268.91 MiB)
``````

``````using VectorizedRoutines
using FFTW
using CUDA
using BenchmarkTools

const USE_CUDA = true

const Arr3{T} = Array{T, 3}
const cmA3{T} = Array{Complex{T}, 3}
const cuA3{T} = CuArray{Complex{T}, 3}

"""计算参数（不可变）

mutable struct

"""
struct Params{T}
# 迭代次数
iter_times :: Int64
# 各方向维度
dim :: NTuple{3, Int64}

M       :: T
# not use
v_gb    :: T
δx      :: T
δt      :: T
l_gb    :: T
σ_gb    :: T

k2 :: Arr3{T}
f_def :: Arr3{T}
end

struct TmpVar{T}
re :: Arr3{T}
fft :: cmA3{T}
cu :: cuA3{T}

TmpVar(arr::Arr3{T}) where {T <: Real} =
new{T}(arr, complex(arr), CuArray{Complex{T}}(arr))
end

"""存放变量

"""
struct Var{T}
rex   :: TmpVar{T}
def   :: TmpVar{T}
tmp   :: TmpVar{T}

sumetasqu   :: TmpVar{T}
f_def_force :: TmpVar{T}

function Var(
η_rex::Arr3{T}, η_def::Arr3{T},
sumetasqu::Arr3{T}, f_def_force::Arr3{T}
) where {T <: Real}
new{T}(
TmpVar(η_rex), TmpVar(η_def),
TmpVar(similar(η_rex)),
TmpVar(sumetasqu), TmpVar(f_def_force)
)
end
end

function f_def_field(itype, dim, δx)
nx, ny, nz = dim
f_def = zeros(dim)

if itype == 1
f_def = @. f_def + 0.05
elseif itype == 2
angle1, λ_1, A_1, C_1 = + π / 12, 256 * δx, 0.05, 0.0
angle2, λ_2, A_2, C_2 = - π / 12, 64  * δx, 0.05, 0.0
xx = δx .* (1:ny)'
f_def1 = zeros(dim)
f_def2 = zeros(dim)
for i in 1:nx, j in 1:ny
f_def1[i,j,1] =  A_1 * (1 + sin(2π * (xx[i]/λ_1 - xx[j]/λ_2))) + C_1
f_def2[i,j,1] =  A_2 * (1 + sin(2π * (xx[i]/λ_1 + xx[j]/λ_2))) + C_2
end
for i in 1:nx, j in 1:ny, k in 1:nz
f_def[i,j,k] = max(f_def1[i,j,k], f_def2[i,j,k])
end
end

f_def
end

"""初始化参数

+ iter_times: 迭代次数
+ dim: 空间维度
"""
function init_params(iter_times = 100, dim = (512, 512, 1))
n    = iter_times
nx, ny, nz = dim
M    = 1.
v_gb = 1.
δx   = 1.
δt   = 0.05
l_gb = sqrt(9.6) * δx
σ_gb = 0.32

f_def = f_def_field(2, dim, δx)
f_def_force = f_def

η_rex = zeros(dim)
η_rex[1:5, :, :] .= 1.
η_def = @. 1. - η_rex
sumetasqu = @. η_rex^2 + η_def^2
ϕ_def = @. η_def^2 / sumetasqu
f_def_force  = @. f_def_force * ϕ_def
g   = @. 2π / nx / δx * [ 0:nx / 2; -nx / 2 + 1:-1 ]
gg  = @. 2π / ny / δx * [ 0:ny / 2; -ny / 2 + 1:-1 ]
ggg = @. 2π / nz / δx * [ 0:nz / 2; -nz / 2 + 1:-1 ]
g1, g2, g3 = Matlab.meshgrid(g, gg, ggg)
k2 = @. g1^2 + g2^2 + g3^2

(
Var(η_rex, η_def, sumetasqu, f_def_force),
Params{Float64}(iter_times, dim, M, v_gb, δx,δt, l_gb,σ_gb, k2, f_def)
)
end

#= 主要的计算函数 =#
help_kernel1(η1::T, η2::T,
sumetasqu::T, f_def_force::T,
p::Params
) where {T <: Real} =
-4/3 * p.M / p.l_gb *
(6 * p.σ_gb/p.l_gb * (η1^3 - η1 + 3*η1*η2^2) -
2 * η1 * η2^2 / sumetasqu^2 * f_def_force)

"仅修改 tmp，其余变量不变"
help_kernel1!(tmp::cmA3{T},
η1::Arr3{T}, η2::Arr3{T},
sumetasqu::Arr3{T}, f_def_force::Arr3{T},
p::Params{T},
) where {T<:Real} =
@. tmp = help_kernel1(η1, η2, sumetasqu, f_def_force, p)

help_kernel2(tmp::T, cη1::T, cη2::T, k2::Real, p::Params) where {T <: Complex} =
cη1 + p.δt * (-p.M * p.σ_gb * k2 * cη2 + tmp)

"仅修改 cη1，其余变量不变"
help_kernel2!(cη1::cmA3{T},
cη2::cmA3{T}, tmp::cmA3{T},
p::Params,
) where {T<:Real} =
@. cη1 = help_kernel2(tmp, cη1, cη2, p.k2, p)

"复制 src 的实数部分到 des"
copyre!(des::Arr3{T}, src::cmA3{T}) where {T<:Real} =
map!(real, des, src)
clearim!(arr::cmA3{T}) where {T<:Real} =
@. arr = complex(real(arr))

"更新 η1，tmp 用作临时变量"
function help_kernel0!(::cmA3{T},
η1::TmpVar{T}, η2::TmpVar{T},
v::Var{T}, p::Params{T},
) where {T<:Real}
# 先计算 fft 的参数，存放在 tmp.fft。然后计算 fft
help_kernel1!(v.tmp.fft, η1.re, η2.re, v.sumetasqu.re, v.f_def_force.re, p)
fft!(v.tmp.fft)

# 先计算函数参数，存放在 η1.fft 中
# 然后计算 ifft
# 最后将 η1.fft 实部复制到 η1.re
# 此时使用
help_kernel2!(η1.fft, η2.fft, v.tmp.fft, p)
ifft!(η1.fft)
copyre!(η1.re, η1.fft) # 保证 fft/re 实部相同
clearim!(η1.fft)
end

function help_kernel0!(::cuA3{T},
η1::TmpVar{T}, η2::TmpVar{T},
v::Var{T}, p::Params{T},
) where {T<:Real}
# 先计算 fft 的参数，存放在 tmp.fft。然后计算 fft
help_kernel1!(v.tmp.fft, η1.re, η2.re, v.sumetasqu.re, v.f_def_force.re, p)
copyto!(v.tmp.cu, v.tmp.fft)
fft!(v.tmp.cu)
copyto!(v.tmp.fft, v.tmp.cu)

# 先计算函数参数，存放在 η1.fft 中
# 然后计算 ifft
# 最后将 η1.fft 实部复制到 η1.re
# 此时使用
help_kernel2!(η1.fft, η2.fft, v.tmp.fft, p)
copyto!(η1.cu, η1.fft)
ifft!(η1.cu)
copyto!(η1.fft, η1.cu)
copyre!(η1.re, η1.fft) # 保证 fft/re 实部相同
clearim!(η1.fft)
copyto!(η1.cu, η1.fft)
end

"for 循环中的主体函数"
function main_kernel!(v::Var, p::Params, i::Int64)
# 更新 rex/def.fft
@static if USE_CUDA
fft!(v.rex.cu)
fft!(v.def.cu)
copyto!(v.rex.fft, v.rex.cu)
copyto!(v.def.fft, v.def.cu)

# 分别更新 rex/def
help_kernel0!(v.tmp.cu, v.rex, v.def, v, p)
help_kernel0!(v.tmp.cu, v.def, v.rex, v, p)
else
# 函数假定此时 rex.fft 与 rex.re 数值相同，仅类型不同
# @assert v.rex.fft==complex(v.rex.re) "[\$i] rex.fft!=complex(rex.re)"
# @assert v.def.fft==complex(v.def.re) "[\$i] rex.fft!=complex(rex.re)"
fft!(v.rex.fft)
fft!(v.def.fft)

# 分别更新 rex/def
help_kernel0!(v.tmp.fft, v.rex, v.def, v, p)
help_kernel0!(v.tmp.fft, v.def, v.rex, v, p)
end

# 更新 sumetasqu/f_def_force
@. v.sumetasqu.re = v.rex.re^2 + v.def.re^2
@. v.f_def_force.re = p.f_def * (v.def.re^2 / v.sumetasqu.re)
end

"主函数"
function main(iter_times = 100, dim = (512, 512, 1))
v, p = init_params(iter_times, dim)

for i in 1:p.iter_times
main_kernel!(v, p, i)
end

v
end

# 运行主函数
@btime main(100, (1024, 1024, 1))
# main()
``````