using BenchmarkTools
function discrete_integrate(xvec::Vector, y1vec::Vector, y2vec::Vector, y3vec::Vector)
s = zero(promote_type(eltype(xvec), eltype(y1vec), eltype(y2vec), eltype(y3vec)))
for i in 1:length(xvec)-1
s += (y1vec[i+1] * y2vec[i+1] * y3vec[i+1] + y1vec[i] * y2vec[i] * y3vec[i]) * (xvec[i+1] - xvec[i])
end
0.5 * s
end
function discrete_integrate(xvec::Vector, y1vec, y2vec, y3vec)
s = zero(promote_type(eltype(xvec), eltype(y1vec), eltype(y2vec), eltype(y3vec)))
for i in 1:length(xvec)-1
s += (y1vec[i+1] * y2vec[i+1] * y3vec[i+1] + y1vec[i] * y2vec[i] * y3vec[i]) * (xvec[i+1] - xvec[i])
end
0.5 * s
end
function funF2(matSlωJ::Array{ComplexF64,3}, Zvec::Vector, ωvec::Vector, nJ::Int64, lm::Int64)
resmat = zeros(ComplexF64, nJ, nJ, 2lm + 1, 2lm + 1)
@sync for l1 = -lm:lm, l2 = -lm:lm
idx1 = lm + l1 + 1
idx2 = lm + l2 + 1
Threads.@spawn for j1 = 1:nJ, j2 = 1:nJ
resmat[j1, j2, idx1, idx2] = discrete_integrate(ωvec, Zvec, conj.(matSlωJ[idx1, :, j1]), matSlωJ[idx2, :, j2])
end
end
resmat
end
function funF3(matSlωJ::Array{ComplexF64,3}, Zvec::Vector, ωvec::Vector, nJ::Int64, lm::Int64)
resmat = zeros(ComplexF64, nJ, nJ, 2lm + 1, 2lm + 1)
nω = length(ωvec)
@sync for l1 = -lm:lm, l2 = -lm:lm
idx1 = lm + l1 + 1
idx2 = lm + l2 + 1
Threads.@spawn for j1 = 1:nJ, j2 = 1:nJ
s = zero(ComplexF64)
for i in 1:(nω-1)
tmpy1 = Zvec[i] * conj.(matSlωJ[idx1, i, j1]) * matSlωJ[idx2, i, j2]
tmpy2 = Zvec[i+1] * conj.(matSlωJ[idx1, i+1, j1]) * matSlωJ[idx2, i+1, j2]
s += (tmpy1 + tmpy2) * (ωvec[i+1] - ωvec[i])
end
resmat[j1, j2, idx1, idx2] = 0.5 * s
end
end
resmat
end
function funF4(matSlωJ::Array{ComplexF64,3}, Zvec::Vector, ωvec::Vector, nJ::Int64, lm::Int64)
resmat = zeros(ComplexF64, nJ, nJ, 2lm + 1, 2lm + 1)
@sync for l1 = -lm:lm, l2 = -lm:lm
idx1 = lm + l1 + 1
idx2 = lm + l2 + 1
Threads.@spawn for j1 = 1:nJ, j2 = 1:nJ
@views resmat[j1, j2, idx1, idx2] = discrete_integrate(ωvec, Zvec, conj.(matSlωJ[idx1, :, j1]), matSlωJ[idx2, :, j2])
end
end
resmat
end
matS = rand(ComplexF64, 5, 1000, 60);
impedance = rand(1000);
ωvec = collect(range(0.0, step=0.01, length=1000));
# @benchmark funF2($matS, $impedance, $ωvec, 60, 2)
# @benchmark funF3($matS, $impedance, $ωvec, 60, 2)
# @benchmark funF4($matS, $impedance, $ωvec, 60, 2) # 除了F3性能最好
这里funF2
性能最差,因为循环过程一直在复制切片——分配内存操作用了27万次;
然后funF3
性能最好,因为我对数组元素进行计算,没有复制切片操作——内存分配168次;
最后funF4
性能中等,即使我用了@views
内存分配也有9万次。
我的问题是:funF4
的内存分配来自于谁?如何避免?
运行结果:
julia> @benchmark funF2($matS, $impedance, $ωvec, 60, 2)
BenchmarkTools.Trial: 5 samples with 1 evaluation.
Range (min … max): 1.024 s … 1.137 s ┊ GC (min … max): 22.47% … 25.52%
Time (median): 1.044 s ┊ GC (median): 24.32%
Time (mean ± σ): 1.070 s ± 48.693 ms ┊ GC (mean ± σ): 24.88% ± 1.95%
█ █ █ █ █
█▁▁▁▁▁▁▁█▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
1.02 s Histogram: frequency by time 1.14 s <
Memory estimate: 4.06 GiB, allocs estimate: 270171.
julia> @benchmark funF3($matS, $impedance, $ωvec, 60, 2)
BenchmarkTools.Trial: 51 samples with 1 evaluation.
Range (min … max): 86.419 ms … 116.561 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 99.776 ms ┊ GC (median): 0.00%
Time (mean ± σ): 98.790 ms ± 6.074 ms ┊ GC (mean ± σ): 0.09% ± 0.60%
▃▃█
▄▁▇▄▄▅▁▄▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▄▇███▅▅▄▄▇▁▄▅▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▄ ▁
86.4 ms Histogram: frequency by time 111 ms <
Memory estimate: 1.39 MiB, allocs estimate: 168.
julia> @benchmark funF4($matS, $impedance, $ωvec, 60, 2)
BenchmarkTools.Trial: 13 samples with 1 evaluation.
Range (min … max): 349.539 ms … 434.925 ms ┊ GC (min … max): 24.48% … 28.21%
Time (median): 392.088 ms ┊ GC (median): 24.99%
Time (mean ± σ): 386.421 ms ± 27.450 ms ┊ GC (mean ± σ): 25.38% ± 1.86%
█ ▁ ▁ ▁ ▁ ▁ ▁█ ▁ ▁ ▁
█▁█▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁█▁█▁▁▁▁▁▁█▁██▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁█▁▁▁▁▁▁▁▁█ ▁
350 ms Histogram: frequency by time 435 ms <
Memory estimate: 1.35 GiB, allocs estimate: 90169.