# 标量优先似乎要求挺高？

``````# 准备初始相空间坐标
zvec = randn(10000)*0.12
δvec = randn(10000)*1.06e-3

const restenergy = 0.511e6            # [eV]

nturns = 10001   # 圈数

struct Par
v1::Float64
r::Float64
ϕs::Float64
ϕ2s::Float64
h1::Int64
h::Int64
circum::Float64
centerenergy::Float64
αc::Float64
end

function revolution(z, δ, p::Par)::Tuple{Float64,Float64}
δ = δ + p.v1 / p.centerenergy * (sin(p.ϕs - 2 * p.h1 * π / p.circum * z ) - sin(p.ϕs) + p.r * sin(p.ϕ2s - 2 * p.h1 * p.h * π / p.circum * z) - p.r * sin(p.ϕ2s))
η = p.αc - 1 / (p.centerenergy * (1 + δ) / restenergy)^2
z = z - η * δ * p.circum
z, δ
end

function revolution(z, δ, k1, k2, c1, c2, c3, p)::Tuple{Float64,Float64}
δ = δ + c1 * (sin(p.ϕs - k1 * z ) + p.r * sin(p.ϕ2s - k2 * z) - c2)
η = p.αc - c3 / (1+δ)^2
z = z - η * δ * p.circum
z, δ
end

function main!(zvec::Vector{Float64}, δvec::Vector{Float64}, nturns, p::Par)
k1 = 2 * p.h1 * π / p.circum
k2 = 2 * p.h1 * p.h * π / p.circum
γ = p.centerenergy/restenergy
c1 = p.v1 / p.centerenergy
c2 = sin(p.ϕs) + p.r * sin(p.ϕ2s)
c3 = 1/γ^2
@inbounds for i = 1:size(zvec, 1)
z, δ = zvec[i], δvec[i]
@inbounds for j = 1:nturns
z, δ = revolution(z, δ, k1, k2, c1, c2, c3, p)   # 这个减少了参数的重复计算
# z, δ = revolution(z, δ, p)    # 这个参数重复计算很费时间
end
zvec[i], δvec[i] = z, δ
end
end

p = Par(3.6395053705870048e+06, 0.1801760487622639, 2.1575815005097385, 6.0620746573056303, 756, 3, 1360.4, 6e9, 0.0000156)
@btime main!(zvec, δvec, nturns, p)
``````

``````revolution(z, δ, p)
``````

``````  8.505 s (0 allocations: 0 bytes)

8.486 s (0 allocations: 0 bytes)

8.603 s (0 allocations: 0 bytes)
``````

``````revolution(z, δ, k1, k2, c1, c2, c3, p)
``````

``````  4.777 s (0 allocations: 0 bytes)

4.880 s (0 allocations: 0 bytes)

4.889 s (0 allocations: 0 bytes)

4.887 s (0 allocations: 0 bytes)
``````

``````julia> @code_warntype main!(zvec, δvec, nturns, p)
Variables
#self#::Core.Const(main!)
zvec::Vector{Float64}
δvec::Vector{Float64}
nturns::Int64
p::Par
@_6::Union{Nothing, Tuple{Int64, Int64}}    # 看这儿
ηvec::Vector{Float64}
i::Int64

Body::Nothing
# 以下忽略
``````

1 个赞

``````│   %29 = Main.size(zvec, 1)::Int64
│   %30 = (1:%29)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
│         (@_6 = Base.iterate(%30))
│   %32 = (@_6 === nothing)::Bool
│   %33 = Base.not_int(%32)::Bool
└──       goto #7 if not %33
``````
1 个赞

``````function main1!(zvec::Vector{Float64}, δvec::Vector{Float64}, nturns, p::Par)
ηvec = zeros(size(zvec, 1))
for i = 1:nturns
δvec .= δvec .+ p.v1 / p.centerenergy .* (sin.(p.ϕs .- 2 * p.h1 * π / p.circum .* zvec) .- sin(p.ϕs) .+ p.r .* sin.(p.ϕ2s .- 2 * p.h1 * p.h * π / p.circum .* zvec) .- p.r * sin(p.ϕ2s))
ηvec .= p.αc .- 1 ./ (p.centerenergy/restenergy .* (1 .+ δvec)) .^ 2
zvec .= zvec .- p.circum .* ηvec .* δvec
end
end

function main2!(zvec::Vector{Float64}, δvec::Vector{Float64}, nturns, p::Par)
k1 = 2 * p.h1 * π / p.circum
k2 = 2 * p.h1 * p.h * π / p.circum
c1 = p.v1 / p.centerenergy
c2 = sin(p.ϕs) + p.r * sin(p.ϕ2s)
c3 = 1 / (p.centerenergy / restenergy)^2

ηvec = zeros(size(zvec, 1))
for i = 1:nturns
δvec .= δvec .+ c1 .* (sin.(p.ϕs .- k1 .* zvec) .+ p.r .* sin.(p.ϕ2s .- k2 .* zvec) .- c2)
ηvec .= p.αc .- c3 ./ (1 .+ δvec) .^ 2
zvec .= zvec .- p.circum .* ηvec .* δvec
end
end
``````

``````# 第一个
1.958 s (2 allocations: 78.17 KiB)
1.956 s (2 allocations: 78.17 KiB)
1.959 s (2 allocations: 78.17 KiB)
1.958 s (2 allocations: 78.17 KiB)
1.961 s (2 allocations: 78.17 KiB)
# 第二个看起来更快的实际更慢
1.989 s (2 allocations: 78.17 KiB)
1.983 s (2 allocations: 78.17 KiB)
1.982 s (2 allocations: 78.17 KiB)
1.993 s (2 allocations: 78.17 KiB)
1.988 s (2 allocations: 78.17 KiB)
``````

``````@inbounds for j = 1:nturns
@inbounds for i = 1:size(zvec, 1)
zvec[i], δvec[i] = revolution(zvec[i], δvec[i], k1, k2, c1, c2, c3, p)   # 这个减少了参数的重复计算
end
end
``````

`@_6::Union{Nothing, Tuple{Int64, Int64}}`

Julia 的 for 循环采用的迭代器协议是 `iterate(x)::Union{Nothing, T}` 的形式，其中返回 `Nothing` 表示迭代结束（类似于 Python 的 `StopIteration`）。对于这种类型可能性有限，并且可能的类型都是具体类型的情况，Julia 有所谓的 union-splitting 技术来优化它，所以你看到的警告颜色是黄色（也就是不太需要关心）

2 个赞

``````function _revolution_cache(p)
k1 = 2 * p.h1 * π / p.circum
k2 = 2 * p.h1 * p.h * π / p.circum
γ = p.centerenergy/restenergy
c1 = p.v1 / p.centerenergy
c2 = sin(p.ϕs) + p.r * sin(p.ϕ2s)
c3 = 1/γ^2
return (; k1, k2, c1, c2, c3)
end

function revolution(z, δ, p::Par, c)::Tuple{Float64,Float64}
# c is the precomputed intermediate result that can be reused

# δ = δ + p.v1 / p.centerenergy * (sin(p.ϕs - 2 * p.h1 * π / p.circum * z ) - sin(p.ϕs) + p.r * sin(p.ϕ2s - 2 * p.h1 * p.h * π / p.circum * z) - p.r * sin(p.ϕ2s))
δ = δ + c.c1 * (sin(p.ϕs - c.k1 * z ) + p.r * sin(p.ϕ2s - c.k2 * z) - c.c2)
# η = p.αc - 1 / (p.centerenergy * (1 + δ) / restenergy)^2
η = p.αc - c.c3 / (1+δ)^2
z = z - η * δ * p.circum
z, δ
end
``````

`Base.sin/cos/exp` 默认都是不内联，因此无法向量化。

2 个赞

``````function main1!(zvec::Vector{Float64}, δvec::Vector{Float64}, nturns, p::Par)
...略...
@inbounds for i = 1:size(zvec, 1)
z, δ = zvec[i], δvec[i]
@inbounds for j = 1:nturns
z, δ = revolution(z, δ, k1, k2, c1, c2, c3, p)
end
zvec[i], δvec[i] = z, δ
end
end

function main2!(zvec::Vector{Float64}, δvec::Vector{Float64}, nturns, p::Par)
...略...
@inbounds for j = 1:nturns
@inbounds for i = 1:size(zvec, 1)
zvec[i], δvec[i] = revolution(zvec[i], δvec[i], k1, k2, c1, c2, c3, p)
end
end
end

julia> @time main1!(zvec, δvec, nturns, p)
3.056780 seconds (371.13 k allocations: 19.812 MiB, 0.60% gc time, 3.34% compilation time)
2.963028 seconds
2.976086 seconds
2.973847 seconds
2.975546 seconds

julia> @time main2!(zvec, δvec, nturns, p)
2.165210 seconds (44.41 k allocations: 2.447 MiB, 0.63% compilation time)
2.160676 seconds
2.139684 seconds
2.137427 seconds
``````

========================================

``````julia> sizeof(zvec)
80000
``````

`(; k1, k2, c1, c2, c3)`

1 个赞

``````julia> a = randn(1024); b = sin.(a);

julia> import Tools.MyFastFloat: SLEEF

julia> a = randn(1024); b = sin.(a);

julia> @btime \$b .= sin.(\$a);
9.000 μs (0 allocations: 0 bytes)

julia> @btime \$b .= SLEEF.sin.(\$a);
29.900 μs (0 allocations: 0 bytes)

julia> @inline xsin(x) = @inline SLEEF.sin(x);

julia> @btime \$b .= xsin.(\$a);
7.300 μs (0 allocations: 0 bytes)

julia> @inline xsin_fast(x) = @inline SLEEF.sin_fast(x);

julia> @btime \$b .= xsin_fast.(\$a);
2.311 μs (0 allocations: 0 bytes)
``````

1 个赞

3 个赞

1 个赞