感谢您的回复。根据你的提示,我修改的t_a函数,(我同时修改了t_s和t_a的逻辑,以便更清晰地检查输出)
function t_s(i::Int)
#length_I = round(Int, 3 + rand())
length_I = 3
o1 = fill(i, length_I)
o2 = fill(i^2, length_I)
o3 = fill(i^3, length_I)
sleep(2.5e-3)
return o1, o2, o3
end
function t_a(x::Int)
o1 = zeros(Int, 4x)
o2 = zeros(Int, 4x)
o3 = zeros(Int, 4x)
m::Int = 0
for i in 1:x
o1s, o2s, o3s = t_s(i)
length_I = length(o1s)
o1[m+1:m+length_I] = o1s
o2[m+1:m+length_I] = o2s
o3[m+1:m+length_I] = o3s
m += length_I
end
o1 = o1[1:m]
o2 = o2[1:m]
o3 = o3[1:m]
return o1, o2, o3
end
上面是原问题,下面的t_a1函数是根据您的提示写的:
function t_a1(x::Int)
o1 = zeros(Int, 4, x)
o2 = zeros(Int, 4, x)
o3 = zeros(Int, 4, x)
@threads for i in 1:x
o1s, o2s, o3s = t_s(i)
length_I = length(o1s)
o1[1:length_I, i] = o1s
o2[1:length_I, i] = o2s
o3[1:length_I, i] = o3s
end
nz_idx = findall(i -> i != 0, o1)
return o1[nz_idx], o2[nz_idx], o3[nz_idx]
end
下面是测试:
julia> using Base.Threads, BenchmarkTools
julia> nthreads()
8
julia> @btime t_a(3)
12.221 ms (42 allocations: 1.88 KiB)
([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 1, 1, 4, 4, 4, 9, 9, 9], [1, 1, 1, 8, 8, 8, 27, 27, 27])
julia> @btime t_a1(3)
3.426 ms (89 allocations: 6.61 KiB)
([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 1, 1, 4, 4, 4, 9, 9, 9], [1, 1, 1, 8, 8, 8, 27, 27, 27])
可以看到确实完成任务,并且有加速。
但我觉得,不把o1、o2、o3初始化成矩阵是不是也可以啊?即:
function t_a2(x::Int)
o1 = zeros(Int, 4x)
o2 = zeros(Int, 4x)
o3 = zeros(Int, 4x)
m = Atomic{Int}(0)
@threads for i in 1:x
o1s, o2s, o3s = t_s(i)
length_I = length(o1s)
old_m = atomic_add!(m, length_I)
o1[old_m+1:old_m+length_I] = o1s
o2[old_m+1:old_m+length_I] = o2s
o3[old_m+1:old_m+length_I] = o3s
end
return o1[1:m[]], o2[1:m[]], o3[1:m[]]
end
julia> @btime t_a2(3)
3.297 ms (85 allocations: 6.27 KiB)
([1, 1, 1, 2, 2, 2, 3, 3, 3], [1, 1, 1, 4, 4, 4, 9, 9, 9], [1, 1, 1, 8, 8, 8, 27, 27, 27])
好像也完成了任务。