# Julia内存分配随着循环的进行越来越大

• `collision` 函数里面没分配过内存。尽量做到这样。
• `streaming` 是内存分配的大头，主要问题是 `circshift`。有点复杂暂时没思路。
• `boundaryCondition`
• `macroQuantity` 使用 inplace 操作避免内存分配。
主要问题在以下几句
``````# mem allo
288016000 ρ = sum(f, dims = 3)
288004000 u0 = copy(u)
288004000 v0 = copy(v)

288004000 uv = @. sqrt(u^2 + v^2)
``````

## inplace 操作

### `macroQuantity()`

`ρ = sum(f, dims = 3)`；参数类型 `f::Array{Float64,3}, ρ::Array{Float64,2}`
`sum(f, dims = 3)` 会返回一个 `10×10×1 Array{Int64,3}:` 导致 `ρ` 类型不稳定

``````f = rand(Int, 10,10,8);
r1 = sum(f, dims = 3);
r2 = rand(Int, 10,10);

typeof(r1)  # Array{Int64,3}
typeof(r2)  # Array{Int64,2}
typeof(r1[:,:,1])   # Array{Int64,2}
r1[:,:,1] == r2     # true
``````
• `sum` => `sum!`
• `copy` => `copy!`
• `uv = @. sqrt(u^2 + v^2)` => `@. uv = sqrt(u^2 + v^2)`

diff

``````# ρ = sum(f, dims = 3)
# u0 = copy(u)
# v0 = copy(v)
sum!(ρ, f)
copy!(u0, u)
copy!(v0, v)

# uv = @. sqrt(u^2 + v^2)
@. uv = sqrt(u^2 + v^2)
# @. uv[:] = sqrt(u[:]^2 + v[:]^2) #  9853 allocations: 5.23 GiB
``````

“15.818 s (9449 allocations: 4.70 GiB)”

2 个赞

`boundaryCondition()`

`@.` `@views` 的组合测试

view.jl
``````
const nx = 1200
const ny = 600
const Q = 8
f = rand(Int, nx, ny, Q);

function r0!(f) # original
@views f[:, ny, 4] = f[:, ny, 2]
@views f[:, ny, 8] = f[:, ny, 6]
@views f[:, ny, 7] = f[:, ny, 5]
end # 7.767 μs (9 allocations: 28.88 KiB)

function r1!(f) # nothing
f[:, ny, 4] = f[:, ny, 2]
f[:, ny, 8] = f[:, ny, 6]
f[:, ny, 7] = f[:, ny, 5]
end # 6.140 μs (3 allocations: 28.50 KiB)

function r2!(f) # dot macro `@.` only
@. f[:, ny, 4] = f[:, ny, 2]
@. f[:, ny, 8] = f[:, ny, 6]
@. f[:, ny, 7] = f[:, ny, 5]
end # 3.600 μs (6 allocations: 28.69 KiB)

function r3!(f) # dot only
f[:, ny, 4] .= f[:, ny, 2]
f[:, ny, 8] .= f[:, ny, 6]
f[:, ny, 7] .= f[:, ny, 5]
end # 3.686 μs (6 allocations: 28.69 KiB)

# @. @views
function r4!(f) # @. @views
@. @views f[:, ny, 4] = f[:, ny, 2]
@. @views f[:, ny, 8] = f[:, ny, 6]
@. @views f[:, ny, 7] = f[:, ny, 5]
end # 1.230 μs (6 allocations: 384 bytes)

function r5!(f) # @views @.
@views @. f[:, ny, 4] = f[:, ny, 2]
@views @. f[:, ny, 8] = f[:, ny, 6]
@views @. f[:, ny, 7] = f[:, ny, 5]
end # 1.210 μs (6 allocations: 384 bytes)

@views f[:, ny, 4] .= f[:, ny, 2]
@views f[:, ny, 8] .= f[:, ny, 6]
@views f[:, ny, 7] .= f[:, ny, 5]
end # 1.230 μs (6 allocations: 384 bytes)

copy!(f[:, ny, 4], f[:, ny, 2])
copy!(f[:, ny, 8], f[:, ny, 6])
copy!(f[:, ny, 7], f[:, ny, 5])
end # 6.750 μs (6 allocations: 57.00 KiB)

# julia> @btime r0!(\$f);
#   7.767 μs (9 allocations: 28.88 KiB)
# julia> @btime r1!(\$f);
#   6.140 μs (3 allocations: 28.50 KiB)
# julia> @btime r2!(\$f);
#   3.600 μs (6 allocations: 28.69 KiB)
# julia> @btime r3!(\$f);
#   3.686 μs (6 allocations: 28.69 KiB)
# julia> @btime r4!(\$f);
#   1.230 μs (6 allocations: 384 bytes)
# julia> @btime r5!(\$f);
#   1.210 μs (6 allocations: 384 bytes)
# julia> @btime r6!(\$f);
#   1.230 μs (6 allocations: 384 bytes)
# julia> @btime r7!(\$f);
#   6.750 μs (6 allocations: 57.00 KiB)
``````
diff
``````#left
# @views f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] .* U * 2.0 / 3.0
# @views f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
# @views f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
@views @. f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] * U * 2.0 / 3.0
@views @. f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
@views @. f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0

#right
# @views f[nx, :, 3] = f[nx-1, :, 3]
# @views f[nx, :, 6] = f[nx-1, :, 6]
# @views f[nx, :, 7] = f[nx-1, :, 7]
@views @. f[nx, :, 3] = f[nx-1, :, 3]
@views @. f[nx, :, 6] = f[nx-1, :, 6]
@views @. f[nx, :, 7] = f[nx-1, :, 7]

#bottom
# @views f[:, 1, 2] = f[:, 1, 4]
# @views f[:, 1, 5] = f[:, 1, 7]
# @views f[:, 1, 6] = f[:, 1, 8]
@views @. f[:, 1, 2] = f[:, 1, 4]
@views @. f[:, 1, 5] = f[:, 1, 7]
@views @. f[:, 1, 6] = f[:, 1, 8]

#top
# @views f[:, ny, 4] = f[:, ny, 2]
# @views f[:, ny, 8] = f[:, ny, 6]
# @views f[:, ny, 7] = f[:, ny, 5]
@views @. f[:, ny, 4] = f[:, ny, 2]
@views @. f[:, ny, 8] = f[:, ny, 6]
@views @. f[:, ny, 7] = f[:, ny, 5]
``````
mem
``````        - # function boundaryCondition(nx, ny, u, v, ρ, U, f)
- function boundaryCondition(nx::Int64, ny::Int64,
-     u::Array{Float64,2}, v::Array{Float64,2}, ρ::Array{Float64,2}, U::Float64, f::Array{Float64,3}
- )
- #left
- # @views f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] .* U * 2.0 / 3.0
- # @views f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
- # @views f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
9600 @views @. f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] * U * 2.0 / 3.0
16000 @views @. f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
16000 @views @. f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
0 @views u[1, 2:ny-1] .= U
0 @views v[1, 2:ny-1] .= 0.0
-
- #right
- # @views f[nx, :, 3] = f[nx-1, :, 3]
- # @views f[nx, :, 6] = f[nx-1, :, 6]
- # @views f[nx, :, 7] = f[nx-1, :, 7]
6400 @views @. f[nx, :, 3] = f[nx-1, :, 3]
6400 @views @. f[nx, :, 6] = f[nx-1, :, 6]
6400 @views @. f[nx, :, 7] = f[nx-1, :, 7]
-
- #bottom
- # @views f[:, 1, 2] = f[:, 1, 4]
- # @views f[:, 1, 5] = f[:, 1, 7]
- # @views f[:, 1, 6] = f[:, 1, 8]
6400 @views @. f[:, 1, 2] = f[:, 1, 4]
6400 @views @. f[:, 1, 5] = f[:, 1, 7]
6400 @views @. f[:, 1, 6] = f[:, 1, 8]
0 @views u[:, 1] .= 0.0
0 @views v[:, 1] .= 0.0
-
- #top
- # @views f[:, ny, 4] = f[:, ny, 2]
- # @views f[:, ny, 8] = f[:, ny, 6]
- # @views f[:, ny, 7] = f[:, ny, 5]
6400 @views @. f[:, ny, 4] = f[:, ny, 2]
6400 @views @. f[:, ny, 8] = f[:, ny, 6]
6400 @views @. f[:, ny, 7] = f[:, ny, 5]
0 @views u[:, ny] .= 0.0
2400 @views v[:, ny] .= 0.0
- end
``````

“8.972 s (8302 allocations: 4.69 GiB)”

1 个赞

`mainFlow` 里计算相对误差的部分拿出来成为一个函数就地改写。

mem

``````        0 for iterStep = 1:iterStepMax
0     collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
0     streaming(Q, cx, cy, f)
0     boundaryCondition(nx, ny, u, v, ρ, U, f)
0     macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
-     # temp1 = sum((u[:, :] - u0[:, :]) .^ 2 + (v[:, :] - v0[:, :]) .^ 2)
1440020000     temp1 = sum((u - u0) .^ 2 + (v - v0) .^ 2)
864012000     temp2 = sum(u .^ 2 + v .^ 2)
0     err = sqrt(temp1) / sqrt(temp2 + 1e-30)
0     if err <= ε
0         break
-     end
- end

0 for k = 1:Q
2617296789     @views f[:, :, k] .= circshift(f[:, :, k], [cx[k], cy[k]])
- end
``````

diff

``````# 在 mainFlow 内，第一个循环外预分配内存
rel_err_temp = ones(Float64, nx, ny)

# for iterStep = 1:iterStepMax 循环内
# temp1 = sum((u - u0) .^ 2 + (v - v0) .^ 2)
# temp2 = sum(u .^ 2 + v .^ 2)
# err = sqrt(temp1) / sqrt(temp2 + 1e-30)
@. rel_err_temp = (u - u0)^2 + (v - v0)^2
temp1 = sum(rel_err_temp)
@. rel_err_temp = u^2 + v^2
temp2 = sum(rel_err_temp)

err = sqrt(temp1 / (temp2 + 1e-30))
``````

“7.671 s (7504 allocations: 2.55 GiB)”

mem
``````        - function mainFlow(iterStepMax::Int64)
0 Q = 9 #D2Q9
- nx = 1200 #lattice number in horizontal direction
- ny = 600 #lattice number in vertical direction
- U = 0.1 #inlet velocity
160 cx = [1.0 0.0 -1.0 0.0 1.0 -1.0 -1.0 1.0 0.0] #velocity components in horizontal direction
160 cy = [0.0 1.0 0.0 -1.0 1.0 1.0 -1.0 -1.0 0.0] #velocity components in vertical direction
160 w = [1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 4.0 / 9.0] #weight parameters
- # cx = [1.0, 0.0, -1.0, 0.0, 1.0, -1.0, -1.0, 1.0, 0.0] #velocity components in horizontal direction
- # cy = [0.0, 1.0, 0.0, -1.0, 1.0, 1.0, -1.0, -1.0, 0.0] #velocity components in vertical direction
- # w = [1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 9.0, 1.0 / 36.0, 1.0 / 36.0, 1.0 / 36.0, 1.0 / 36.0, 4.0 / 9.0] #weight parameters
-
5760080 rel_err_temp = Array{Float64}(undef, nx, ny)
5760080 ρ = ones(Float64, nx, ny) #initial density
5760080 u = zeros(Float64, nx, ny) #horizontal velocity component
5760080 v = zeros(Float64, nx, ny) #vertical velocity component
5760080 u0 = zeros(Float64, nx, ny) #horizontal velocity component before iteration
5760080 v0 = zeros(Float64, nx, ny) #vertical velocity component befre iteration
- # u0 = Array{Float64}(undef, nx, ny)
- # v0 = Array{Float64}(undef, nx, ny)
5760080 uv = zeros(Float64, nx, ny) #resultant velocity
51840080 f = zeros(Float64, nx, ny, Q) #distribution function
51840080 feq = zeros(Float64, nx, ny, Q) #equilibrium distribution function
-
- dx = 1.0 #horizontal lattice length
- dy = 1.0 #vertical lattice length
- lx = dx * nx #horizontal domain length
- ly = dy * ny #vertical domain length
- dt = dx #dt = 1.0
- c = dx / dt #lattice sound speed
- Re = 1000.0 #Reynoldz number
- ν = U * ly / Re #kinematic viscousity
- τ = 3.0ν + 0.5 #relaxation time
- ω = 1.0 / τ #relaxation frequency
4792960 println("Re = ", Re, ", τ = ", τ) #instead of "Re = \$Re, τ = \$τ"
- # iterStepMax = 10000
- ε = 1.0e-6 #convergence requirement
-
0 @views u[1, 2:ny-1] .= U
0 for iterStep = 1:iterStepMax
0     collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
0     streaming(Q, cx, cy, f)
0     boundaryCondition(nx, ny, u, v, ρ, U, f)
0     macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
-
-     # temp1 = sum((u[:, :] - u0[:, :]) .^ 2 + (v[:, :] - v0[:, :]) .^ 2)
-     # temp1 = sum((u - u0) .^ 2 + (v - v0) .^ 2)
-     # temp2 = sum(u .^ 2 + v .^ 2)
-     # err = sqrt(temp1) / sqrt(temp2 + 1e-30)
0     @. rel_err_temp = (u - u0)^2 + (v - v0)^2
0     temp1 = sum(rel_err_temp)
0     @. rel_err_temp = u^2 + v^2
0     temp2 = sum(rel_err_temp)
-
0     err = sqrt(temp1 / (temp2 + 1e-30))
0     if err <= ε
0         break
-     end
- end
0     nx, ny, u, v
- end
``````
1 个赞

2 个赞

circshift 类型不稳定及内存分配问题的解决

7.060 s (7503 allocations: 2.55 GiB)
6.694 s (3507 allocations: 2.55 GiB)

``````# cx = [1.0 0.0 -1.0 0.0 1.0 -1.0 -1.0 1.0 0.0] #velocity components in horizontal direction
# cy = [0.0 1.0 0.0 -1.0 1.0 1.0 -1.0 -1.0 0.0] #velocity components in vertical direction
cx = [1 0 -1  0 1 -1 -1  1 0] #velocity components in horizontal direction
cy = [0 1  0 -1 1  1 -1 -1 0] #velocity components in vertical direction

# @views f[:, :, k] .= circshift(f[:, :, k], [cx[k], cy[k]])
@views f[:, :, k] .= circshift(f[:, :, k], (cx[k], cy[k]))
``````

julia 的 `circshift` 实现有点问题 #25003

9.204 s (10858 allocations: 7.38 GiB)
7.060 s (7503 allocations: 2.55 GiB) 修复类型不稳定
6.694 s (3507 allocations: 2.55 GiB) 换用其他 `circshift` 实现
5.533 s (3003 allocations: 137.50 MiB)

``````julia> @benchmark mainFlow(50)
BenchmarkTools.Trial:
memory estimate:  137.50 MiB
allocs estimate:  3051
--------------
minimum time:     5.292 s (0.02% GC)
median time:      5.292 s (0.02% GC)
mean time:        5.292 s (0.02% GC)
maximum time:     5.292 s (0.02% GC)
--------------
samples:          1
evals/sample:     1
``````

2763v3.0.jl
``````using ShiftedArrays

function mainFlow(iterStepMax::Int64)
Q = 9 #D2Q9
nx = 1200 #lattice number in horizontal direction
ny = 600 #lattice number in vertical direction
U = 0.1 #inlet velocity
# cx = [1.0 0.0 -1.0 0.0 1.0 -1.0 -1.0 1.0 0.0] #velocity components in horizontal direction
# cy = [0.0 1.0 0.0 -1.0 1.0 1.0 -1.0 -1.0 0.0] #velocity components in vertical direction
cx = [1 0 -1  0 1 -1 -1  1 0] #velocity components in horizontal direction
cy = [0 1  0 -1 1  1 -1 -1 0] #velocity components in vertical direction

w = [1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 4.0 / 9.0] #weight parameters

# rel_err_temp = Array{Float64}(undef, nx, ny)
rel_err_temp = ones(Float64, nx, ny)
ρ = ones(Float64, nx, ny) #initial density
u = zeros(Float64, nx, ny) #horizontal velocity component
v = zeros(Float64, nx, ny) #vertical velocity component
u0 = zeros(Float64, nx, ny) #horizontal velocity component before iteration
v0 = zeros(Float64, nx, ny) #vertical velocity component befre iteration
# u0 = Array{Float64}(undef, nx, ny)
# v0 = Array{Float64}(undef, nx, ny)
uv = zeros(Float64, nx, ny) #resultant velocity
f = zeros(Float64, nx, ny, Q) #distribution function
feq = zeros(Float64, nx, ny, Q) #equilibrium distribution function

dx = 1.0 #horizontal lattice length
dy = 1.0 #vertical lattice length
lx = dx * nx #horizontal domain length
ly = dy * ny #vertical domain length
dt = dx #dt = 1.0
c = dx / dt #lattice sound speed
Re = 1000.0 #Reynoldz number
ν = U * ly / Re #kinematic viscousity
τ = 3.0ν + 0.5 #relaxation time
ω = 1.0 / τ #relaxation frequency
println("Re = ", Re, ", τ = ", τ) #instead of "Re = \$Re, τ = \$τ"
# iterStepMax = 10000
ε = 1.0e-6 #convergence requirement

@views u[1, 2:ny-1] .= U
for iterStep = 1:iterStepMax
collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
streaming(Q, cx, cy, f)
boundaryCondition(nx, ny, u, v, ρ, U, f)
macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)

# temp1 = sum((u[:, :] - u0[:, :]) .^ 2 + (v[:, :] - v0[:, :]) .^ 2)
# temp2 = sum(u .^ 2 + v .^ 2)
# err = sqrt(temp1) / sqrt(temp2 + 1e-30)
@. rel_err_temp = (u - u0)^2 + (v - v0)^2
temp1 = sum(rel_err_temp)
@. rel_err_temp = u^2 + v^2
temp2 = sum(rel_err_temp)

err = sqrt(temp1 / (temp2 + 1e-30))
if err <= ε
break
end
end
end

function collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
for j = 1:ny, i = 1:nx
t1 = u[i, j] * u[i, j] + v[i, j] * v[i, j]
for k = 1:Q
t2 = u[i, j] * cx[k] + v[i, j] * cy[k]
feq[i, j, k] = ρ[i, j] * w[k] * (1.0 + 3.0t2 + 4.5 * t2^2 - 1.5t1)
f[i, j, k] = (1.0 - ω) * f[i, j, k] + ω * feq[i, j, k]
end
end
end

function streaming(Q, cx, cy, f)
for k = 1:Q
# @views f[:, :, k] .= circshift(f[:, :, k], [cx[k], cy[k]])
# @views f[:, :, k] .= circshift(f[:, :, k], (cx[k], cy[k]))
@views f[:, :, k] .= ShiftedArrays.circshift(f[:, :, k], (cx[k], cy[k]))
end
end

function boundaryCondition(nx, ny, u, v, ρ, U, f)
#left
# @views f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] .* U * 2.0 / 3.0
# @views f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
# @views f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
@views @. f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] * U * 2.0 / 3.0
@views @. f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
@views @. f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
@views u[1, 2:ny-1] .= U
@views v[1, 2:ny-1] .= 0.0

#right
# @views f[nx, :, 3] = f[nx-1, :, 3]
# @views f[nx, :, 6] = f[nx-1, :, 6]
# @views f[nx, :, 7] = f[nx-1, :, 7]
@views @. f[nx, :, 3] = f[nx-1, :, 3]
@views @. f[nx, :, 6] = f[nx-1, :, 6]
@views @. f[nx, :, 7] = f[nx-1, :, 7]

#bottom
# @views f[:, 1, 2] = f[:, 1, 4]
# @views f[:, 1, 5] = f[:, 1, 7]
# @views f[:, 1, 6] = f[:, 1, 8]
@views @. f[:, 1, 2] = f[:, 1, 4]
@views @. f[:, 1, 5] = f[:, 1, 7]
@views @. f[:, 1, 6] = f[:, 1, 8]
@views u[:, 1] .= 0.0
@views v[:, 1] .= 0.0

#top
# @views f[:, ny, 4] = f[:, ny, 2]
# @views f[:, ny, 8] = f[:, ny, 6]
# @views f[:, ny, 7] = f[:, ny, 5]
@views @. f[:, ny, 4] = f[:, ny, 2]
@views @. f[:, ny, 8] = f[:, ny, 6]
@views @. f[:, ny, 7] = f[:, ny, 5]
@views u[:, ny] .= 0.0
@views v[:, ny] .= 0.0
end

function macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
# ρ[:, :] = sum(f, dims = 3)
# u0[:, :] = u[:, :]
# v0[:, :] = v[:, :]
sum!(ρ, f)
copy!(u0, u)
copy!(v0, v)

for j = 1:ny, i = 1:nx
uSum = 0.0
vSum = 0.0
for k = 1:Q
uSum += f[i, j, k] * cx[k]
vSum += f[i, j, k] * cy[k]
end
u[i, j] = uSum / ρ[i, j]
v[i, j] = vSum / ρ[i, j]
end
# uv[:, :] = @. sqrt(u[:, :]^2 + v[:, :]^2)
@. uv = sqrt(u^2 + v^2)
end

# mainFlow(50)
using BenchmarkTools
@btime mainFlow(50)
``````

.mem
``````        - using ShiftedArrays
-
- function mainFlow(iterStepMax::Int64)
444211806 Q = 9 #D2Q9
- nx = 1200 #lattice number in horizontal direction
- ny = 600 #lattice number in vertical direction
- U = 0.1 #inlet velocity
- # cx = [1.0 0.0 -1.0 0.0 1.0 -1.0 -1.0 1.0 0.0] #velocity components in horizontal direction
- # cy = [0.0 1.0 0.0 -1.0 1.0 1.0 -1.0 -1.0 0.0] #velocity components in vertical direction
160 cx = [1 0 -1  0 1 -1 -1  1 0] #velocity components in horizontal direction
160 cy = [0 1  0 -1 1  1 -1 -1 0] #velocity components in vertical direction
-
160 w = [1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 4.0 / 9.0] #weight parameters
-
- # rel_err_temp = Array{Float64}(undef, nx, ny)
5760080 rel_err_temp = ones(Float64, nx, ny)
5760080 ρ = ones(Float64, nx, ny) #initial density
5760080 u = zeros(Float64, nx, ny) #horizontal velocity component
5760080 v = zeros(Float64, nx, ny) #vertical velocity component
5760080 u0 = zeros(Float64, nx, ny) #horizontal velocity component before iteration
5760080 v0 = zeros(Float64, nx, ny) #vertical velocity component befre iteration
- # u0 = Array{Float64}(undef, nx, ny)
- # v0 = Array{Float64}(undef, nx, ny)
5760080 uv = zeros(Float64, nx, ny) #resultant velocity
51840080 f = zeros(Float64, nx, ny, Q) #distribution function
51840080 feq = zeros(Float64, nx, ny, Q) #equilibrium distribution function
-
- dx = 1.0 #horizontal lattice length
- dy = 1.0 #vertical lattice length
- lx = dx * nx #horizontal domain length
- ly = dy * ny #vertical domain length
- dt = dx #dt = 1.0
- c = dx / dt #lattice sound speed
- Re = 1000.0 #Reynoldz number
- ν = U * ly / Re #kinematic viscousity
- τ = 3.0ν + 0.5 #relaxation time
- ω = 1.0 / τ #relaxation frequency
4792848 println("Re = ", Re, ", τ = ", τ) #instead of "Re = \$Re, τ = \$τ"
- # iterStepMax = 10000
- ε = 1.0e-6 #convergence requirement
-
0 @views u[1, 2:ny-1] .= U
0 for iterStep = 1:iterStepMax
0     collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
0     streaming(Q, cx, cy, f)
0     boundaryCondition(nx, ny, u, v, ρ, U, f)
0     macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
-
-     # temp1 = sum((u[:, :] - u0[:, :]) .^ 2 + (v[:, :] - v0[:, :]) .^ 2)
-     # temp2 = sum(u .^ 2 + v .^ 2)
-     # err = sqrt(temp1) / sqrt(temp2 + 1e-30)
0     @. rel_err_temp = (u - u0)^2 + (v - v0)^2
0     temp1 = sum(rel_err_temp)
0     @. rel_err_temp = u^2 + v^2
0     temp2 = sum(rel_err_temp)
-
0     err = sqrt(temp1 / (temp2 + 1e-30))
0     if err <= ε
0         break
-     end
- end
-
- end
-
- function collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
0 for j = 1:ny, i = 1:nx
0     t1 = u[i, j] * u[i, j] + v[i, j] * v[i, j]
0     for k = 1:Q
0         t2 = u[i, j] * cx[k] + v[i, j] * cy[k]
0         feq[i, j, k] = ρ[i, j] * w[k] * (1.0 + 3.0t2 + 4.5 * t2^2 - 1.5t1)
0         f[i, j, k] = (1.0 - ω) * f[i, j, k] + ω * feq[i, j, k]
-     end
- end
- end
-
- function streaming(Q, cx, cy, f)
0 for k = 1:Q
-     # @views f[:, :, k] .= circshift(f[:, :, k], [cx[k], cy[k]])
-     # @views f[:, :, k] .= circshift(f[:, :, k], (cx[k], cy[k]))
72000     @views f[:, :, k] .= ShiftedArrays.circshift(f[:, :, k], (cx[k], cy[k]))
- end
- end
-
- function boundaryCondition(nx, ny, u, v, ρ, U, f)
- #left
- # @views f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] .* U * 2.0 / 3.0
- # @views f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
- # @views f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 .* (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] .* U / 6.0
9600 @views @. f[1, 2:ny-1, 1] = f[1, 2:ny-1, 3] + ρ[1, 2:ny-1] * U * 2.0 / 3.0
16000 @views @. f[1, 2:ny-1, 5] = f[1, 2:ny-1, 7] - 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
16000 @views @. f[1, 2:ny-1, 8] = f[1, 2:ny-1, 6] + 0.5 * (f[1, 2:ny-1, 2] - f[1, 2:ny-1, 4]) + ρ[1, 2:ny-1] * U / 6.0
0 @views u[1, 2:ny-1] .= U
0 @views v[1, 2:ny-1] .= 0.0
-
- #right
- # @views f[nx, :, 3] = f[nx-1, :, 3]
- # @views f[nx, :, 6] = f[nx-1, :, 6]
- # @views f[nx, :, 7] = f[nx-1, :, 7]
6400 @views @. f[nx, :, 3] = f[nx-1, :, 3]
6400 @views @. f[nx, :, 6] = f[nx-1, :, 6]
6400 @views @. f[nx, :, 7] = f[nx-1, :, 7]
-
- #bottom
- # @views f[:, 1, 2] = f[:, 1, 4]
- # @views f[:, 1, 5] = f[:, 1, 7]
- # @views f[:, 1, 6] = f[:, 1, 8]
6400 @views @. f[:, 1, 2] = f[:, 1, 4]
6400 @views @. f[:, 1, 5] = f[:, 1, 7]
6400 @views @. f[:, 1, 6] = f[:, 1, 8]
0 @views u[:, 1] .= 0.0
0 @views v[:, 1] .= 0.0
-
- #top
- # @views f[:, ny, 4] = f[:, ny, 2]
- # @views f[:, ny, 8] = f[:, ny, 6]
- # @views f[:, ny, 7] = f[:, ny, 5]
6400 @views @. f[:, ny, 4] = f[:, ny, 2]
6400 @views @. f[:, ny, 8] = f[:, ny, 6]
6400 @views @. f[:, ny, 7] = f[:, ny, 5]
0 @views u[:, ny] .= 0.0
2400 @views v[:, ny] .= 0.0
- end
-
-
- function macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
- # ρ[:, :] = sum(f, dims = 3)
- # u0[:, :] = u[:, :]
- # v0[:, :] = v[:, :]
0 sum!(ρ, f)
0 copy!(u0, u)
0 copy!(v0, v)
-
0 for j = 1:ny, i = 1:nx
-     uSum = 0.0
-     vSum = 0.0
0     for k = 1:Q
0         uSum += f[i, j, k] * cx[k]
0         vSum += f[i, j, k] * cy[k]
-     end
0     u[i, j] = uSum / ρ[i, j]
0     v[i, j] = vSum / ρ[i, j]
- end
- # uv[:, :] = @. sqrt(u[:, :]^2 + v[:, :]^2)
0 @. uv = sqrt(u^2 + v^2)
- end
-
- mainFlow(50)

``````
2 个赞

``````function streaming(Q, cx, cy, f, F)
copy!(F, f)
for k = 1:Q
@views f[:, :, k] = ShiftedArrays.circshift(F[:, :, k], (cx[k], cy[k]))
end
end
``````

View目前会在heap上分配，把这个在batch dimension上的view展开成loop会快一些。就能把这部分allocation去掉。

3 个赞

``````using ShiftedArrays, Plots

function mainFlow(iterStepMax::Int64)
Q = 9 #D2Q9
nx = 1200 #lattice number in horizontal direction
ny = 600 #lattice number in vertical direction
U = 0.1 #inlet velocity
cx = [1 0 -1 0 1 -1 -1 1 0] #velocity components in horizontal direction
cy = [0 1 0 -1 1 1 -1 -1 0] #velocity components in vertical direction
w = [1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 4.0 / 9.0] #weight parameters
rel_err_temp = ones(Float64, nx, ny)
ρ = ones(Float64, nx, ny) #initial density
u = zeros(Float64, nx, ny) #horizontal velocity component
v = zeros(Float64, nx, ny) #vertical velocity component
u0 = zeros(Float64, nx, ny) #horizontal velocity component before iteration
v0 = zeros(Float64, nx, ny) #vertical velocity component befre iteration
uv = zeros(Float64, nx, ny) #resultant velocity
f = zeros(Float64, nx, ny, Q) #distribution function
F = zeros(Float64, nx, ny, Q)
feq = zeros(Float64, nx, ny, Q) #equilibrium distribution function
dx = 1.0 #horizontal lattice length
dy = 1.0 #vertical lattice length
lx = dx * nx #horizontal domain length
ly = dy * ny #vertical domain length
dt = dx #dt = 1.0
c = dx / dt #lattice sound speed
Re = 1000.0 #Reynoldz number
ν = U * ly / Re #kinematic viscousity
τ = 3.0ν + 0.5 #relaxation time
ω = 1.0 / τ #relaxation frequency
println("Re = ", Re, ", τ = ", τ) #instead of "Re = \$Re, τ = \$τ"
ε = 1.0e-6 #convergence requirement

@views u[1, 2:ny-1] .= U
for iterStep = 1:iterStepMax
collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
streaming(Q, cx, cy, f, F)
boundaryCondition(nx, ny, u, v, ρ, U, f)
macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
@. rel_err_temp = (u - u0)^2 + (v - v0)^2
temp1 = sum(rel_err_temp)
@. rel_err_temp = u^2 + v^2
temp2 = sum(rel_err_temp)
err = sqrt(temp1 / (temp2 + 1e-30))
if fld(iterStep, 100) * 100 == iterStep #instead of "floor"
println(
"iterStep = \$iterStep",
", residual = ",
round(err, digits = 8),
", top-left velocity = ",
round(uv[300, 450], digits = 8),
", bottom-left velocity = ",
round(uv[300, 150], digits = 8),
", top-right velocity = ",
round(uv[900, 450], digits = 8),
", bottom-right velocity = ",
round(uv[900, 150], digits = 8),
)
end
if err <= ε
break
end
end
clibrary(:colorcet)
contourf(uv, color = :rainbow, aspect_ratio = 1.0)
end

function collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
for j = 1:ny, i = 1:nx
t1 = u[i, j] * u[i, j] + v[i, j] * v[i, j]
for k = 1:Q
t2 = u[i, j] * cx[k] + v[i, j] * cy[k]
feq[i, j, k] = ρ[i, j] * w[k] * (1.0 + 3.0t2 + 4.5 * t2^2 - 1.5t1)
f[i, j, k] = (1.0 - ω) * f[i, j, k] + ω * feq[i, j, k]
end
end
end

function streaming(Q, cx, cy, f, F)
copy!(F, f)
for k = 1:Q
@views f[:, :, k] = ShiftedArrays.circshift(F[:, :, k], (cx[k], cy[k]))
end
end

function boundaryCondition(nx, ny, u, v, ρ, U, f)
#left and right
for j = 2:ny-1
f[1, j, 1] = f[1, j, 3] + ρ[1, j] * U * 2.0 / 3.0
f[1, j, 5] = f[1, j, 7] - 0.5 * (f[1, j, 2] - f[1, j, 4]) + ρ[1, j] * U / 6.0
f[1, j, 8] = f[1, j, 6] + 0.5 * (f[1, j, 2] - f[1, j, 4]) + ρ[1, j] * U / 6.0
f[nx, j, 3] = f[nx-1, j, 3]
f[nx, j, 6] = f[nx-1, j, 6]
f[nx, j, 7] = f[nx-1, j, 7]
end
u[1, 2:ny-1] .= U
v[1, 2:ny-1] .= 0.0

#bottom and top
for i = 1:nx
f[i, 1, 2] = f[i, 1, 4]
f[i, 1, 5] = f[i, 1, 7]
f[i, 1, 6] = f[i, 1, 8]
f[i, ny, 4] = f[i, ny, 2]
f[i, ny, 8] = f[i, ny, 6]
f[i, ny, 7] = f[i, ny, 5]
end
u[:, 1] .= 0.0
v[:, 1] .= 0.0
u[:, ny] .= 0.0
v[:, ny] .= 0.0
end

function macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
sum!(ρ, f)
copy!(u0, u)
copy!(v0, v)
for j = 1:ny, i = 1:nx
uSum = 0.0
vSum = 0.0
for k = 1:Q
uSum += f[i, j, k] * cx[k]
vSum += f[i, j, k] * cy[k]
end
u[i, j] = uSum / ρ[i, j]
v[i, j] = vSum / ρ[i, j]
end
@. uv = sqrt(u^2 + v^2)
end
``````

.mem文件

``````        - using ShiftedArrays, Plots
-
- function mainFlow(iterStepMax::Int64)
211946304     Q = 9 #D2Q9
-     nx = 1200 #lattice number in horizontal direction
-     ny = 600 #lattice number in vertical direction
-     U = 0.1 #inlet velocity
160     cx = [1 0 -1 0 1 -1 -1 1 0] #velocity components in horizontal direction
160     cy = [0 1 0 -1 1 1 -1 -1 0] #velocity components in vertical direction
160     w = [1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 9.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 1.0 / 36.0 4.0 / 9.0] #weight parameters
5760080     rel_err_temp = ones(Float64, nx, ny)
5760080     ρ = ones(Float64, nx, ny) #initial density
5760080     u = zeros(Float64, nx, ny) #horizontal velocity component
5760080     v = zeros(Float64, nx, ny) #vertical velocity component
5760080     u0 = zeros(Float64, nx, ny) #horizontal velocity component before iteration
5760080     v0 = zeros(Float64, nx, ny) #vertical velocity component befre iteration
5760080     uv = zeros(Float64, nx, ny) #resultant velocity
51840080     f = zeros(Float64, nx, ny, Q) #distribution function
51840080     F = zeros(Float64, nx, ny, Q)
51840080     feq = zeros(Float64, nx, ny, Q) #equilibrium distribution function
-     dx = 1.0 #horizontal lattice length
-     dy = 1.0 #vertical lattice length
-     lx = dx * nx #horizontal domain length
-     ly = dy * ny #vertical domain length
-     dt = dx #dt = 1.0
-     c = dx / dt #lattice sound speed
-     Re = 1000.0 #Reynoldz number
-     ν = U * ly / Re #kinematic viscousity
-     τ = 3.0ν + 0.5 #relaxation time
-     ω = 1.0 / τ #relaxation frequency
4941680     println("Re = ", Re, ", τ = ", τ) #instead of "Re = \$Re, τ = \$τ"
-     ε = 1.0e-6 #convergence requirement
-
0     @views u[1, 2:ny-1] .= U
0     for iterStep = 1:iterStepMax
0         collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
0         streaming(Q, cx, cy, f, F)
0         boundaryCondition(nx, ny, u, v, ρ, U, f)
0         macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
0         @. rel_err_temp = (u - u0)^2 + (v - v0)^2
0         temp1 = sum(rel_err_temp)
0         @. rel_err_temp = u^2 + v^2
0         temp2 = sum(rel_err_temp)
0         err = sqrt(temp1 / (temp2 + 1e-30))
0         if fld(iterStep, 100) * 100 == iterStep #instead of "floor"
0             println(
-                 "iterStep = \$iterStep",
-                 ", residual = ",
-                 round(err, digits = 8),
-                 ", top-left velocity = ",
-                 round(uv[300, 450], digits = 8),
-                 ", bottom-left velocity = ",
-                 round(uv[300, 150], digits = 8),
-                 ", top-right velocity = ",
-                 round(uv[900, 450], digits = 8),
-                 ", bottom-right velocity = ",
-                 round(uv[900, 150], digits = 8),
-             )
-         end
0         if err <= ε
0             break
-         end
-     end
0     clibrary(:colorcet)
64     contourf(uv, color = :rainbow, aspect_ratio = 1.0)
- end
-
- function collision(nx, ny, Q, u, v, ρ, cx, cy, w, ω, f, feq)
0     for j = 1:ny, i = 1:nx
0         t1 = u[i, j] * u[i, j] + v[i, j] * v[i, j]
0         for k = 1:Q
0             t2 = u[i, j] * cx[k] + v[i, j] * cy[k]
0             feq[i, j, k] = ρ[i, j] * w[k] * (1.0 + 3.0t2 + 4.5 * t2^2 - 1.5t1)
0             f[i, j, k] = (1.0 - ω) * f[i, j, k] + ω * feq[i, j, k]
-         end
-     end
- end
-
- function streaming(Q, cx, cy, f, F)
0     copy!(F, f)
0     for k = 1:Q
43200         @views f[:, :, k] = ShiftedArrays.circshift(F[:, :, k], (cx[k], cy[k]))
-     end
- end
-
- function boundaryCondition(nx, ny, u, v, ρ, U, f)
-     #left and right
0     for j = 2:ny-1
0         f[1, j, 1] = f[1, j, 3] + ρ[1, j] * U * 2.0 / 3.0
0         f[1, j, 5] = f[1, j, 7] - 0.5 * (f[1, j, 2] - f[1, j, 4]) + ρ[1, j] * U / 6.0
0         f[1, j, 8] = f[1, j, 6] + 0.5 * (f[1, j, 2] - f[1, j, 4]) + ρ[1, j] * U / 6.0
0         f[nx, j, 3] = f[nx-1, j, 3]
0         f[nx, j, 6] = f[nx-1, j, 6]
0         f[nx, j, 7] = f[nx-1, j, 7]
-     end
0     u[1, 2:ny-1] .= U
0     v[1, 2:ny-1] .= 0.0
-
-     #bottom and top
0     for i = 1:nx
0         f[i, 1, 2] = f[i, 1, 4]
0         f[i, 1, 5] = f[i, 1, 7]
0         f[i, 1, 6] = f[i, 1, 8]
0         f[i, ny, 4] = f[i, ny, 2]
0         f[i, ny, 8] = f[i, ny, 6]
0         f[i, ny, 7] = f[i, ny, 5]
-     end
0     u[:, 1] .= 0.0
0     v[:, 1] .= 0.0
0     u[:, ny] .= 0.0
2400     v[:, ny] .= 0.0
- end
-
-
- function macroQuantity(nx, ny, Q, u, v, uv, u0, v0, ρ, cx, cy, f)
0     sum!(ρ, f)
0     copy!(u0, u)
0     copy!(v0, v)
0     for j = 1:ny, i = 1:nx
-         uSum = 0.0
-         vSum = 0.0
0         for k = 1:Q
0             uSum += f[i, j, k] * cx[k]
0             vSum += f[i, j, k] * cy[k]
-         end
0         u[i, j] = uSum / ρ[i, j]
0         v[i, j] = vSum / ρ[i, j]
-     end
0     @. uv = sqrt(u^2 + v^2)
- end
-
``````

``````@benchmark mainFlow(50)
BenchmarkTools.Trial:
memory estimate:  197.99 MiB
allocs estimate:  3231
--------------
minimum time:     4.342 s (3.28% GC)
median time:      4.362 s (1.65% GC)
mean time:        4.362 s (1.65% GC)
maximum time:     4.382 s (0.03% GC)
--------------
samples:          2
evals/sample:     1
``````
``````@benchmark mainFlow(100)
BenchmarkTools.Trial:
memory estimate:  198.04 MiB
allocs estimate:  4244
--------------
minimum time:     8.771 s (0.01% GC)
median time:      8.771 s (0.01% GC)
mean time:        8.771 s (0.01% GC)
maximum time:     8.771 s (0.01% GC)
--------------
samples:          1
evals/sample:     1
``````

index的时候要加inbounds。好好读下performance tips再？看看有没有没有遵守的tips

1 个赞

1 个赞

1 个赞

3 个赞

Windows操作系统。我在powershell里面用julia --allocation-track=user设置后，我运行一个程序然后关闭Julia，找不到有.mem文件阿。到底是哪个细节出问题了呢？您能不能给个简单的截图教程阿？相信很多人都会用得到！万分感谢！昨天折腾了一天，心都累了。