一个类型稳定的优化问题

magicly · 2021 年4 月 2 日 10:58

最近在优化一个问题，今天看到 https://www.cs.purdue.edu/homes/hnassar/JPUG/performance.html ，跑了一下里面的例子，结果发现一件出乎意料的事情：其中两个方法生成的 @code_native 完全一样，性能差别却挺大，来论坛求助一下。

(base) ➜  mcts git:(master) ✗ julia
               _
   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.6.0 (2021-03-24)
 _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
|__/                   |

julia> include("src/test-perf.jl")
volume(c1) == volume(c2) == volume(c3) = true
  17.612 ns (1 allocation: 16 bytes)
  6.010 ns (1 allocation: 16 bytes)
  12.974 ns (1 allocation: 16 bytes)
1.7160000000000002

julia> @code_typed volume(c1)
CodeInfo(
1 ─ %1 = Base.getfield(c, :length)::Any
│   %2 = Base.getfield(c, :width)::Any
│   %3 = Base.getfield(c, :height)::Any
│   %4 = (%1 * %2 * %3)::Any
└──      return %4
) => Any

julia> @code_typed volume(c2)
CodeInfo(
1 ─ %1 = Base.getfield(c, :length)::Float64
│   %2 = Base.getfield(c, :width)::Float64
│   %3 = Base.getfield(c, :height)::Float64
│   %4 = Base.mul_float(%1, %2)::Float64
│   %5 = Base.mul_float(%4, %3)::Float64
└──      return %5
) => Float64

julia> @code_typed volume(c3)
CodeInfo(
1 ─ %1 = Base.getfield(c, :length)::Float64
│   %2 = Base.getfield(c, :width)::Float64
│   %3 = Base.getfield(c, :height)::Float64
│   %4 = Base.mul_float(%1, %2)::Float64
│   %5 = Base.mul_float(%4, %3)::Float64
└──      return %5
) => Float64

julia> @code_native volume(c1)
        .text
; ┌ @ test-perf.jl:6 within `volume'
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %rbx
        andq    $-32, %rsp
        subq    $96, %rsp
        vxorps  %xmm0, %xmm0, %xmm0
        vmovaps %ymm0, (%rsp)
        movq    $0, 32(%rsp)
        movq    %rsi, 64(%rsp)
        movq    %fs:0, %rbx
        movq    $12, (%rsp)
        movq    -32768(%rbx), %rax
        movq    %rax, 8(%rsp)
        movq    %rsp, %rax
        movq    %rax, -32768(%rbx)
        movq    (%rsi), %rax
; │┌ @ Base.jl:33 within `getproperty'
        movq    (%rax), %rcx
        movq    8(%rax), %rdx
        movq    16(%rax), %rax
        movq    %rcx, 32(%rsp)
        movq    %rdx, 24(%rsp)
        movq    %rax, 16(%rsp)
; │└
        movq    %rcx, 40(%rsp)
        movq    %rdx, 48(%rsp)
        movq    %rax, 56(%rsp)
        movabsq $jl_apply_generic, %rax
        movabsq $jl_system_image_data, %rdi
        leaq    40(%rsp), %rsi
        movl    $3, %edx
        vzeroupper
        callq   *%rax
        movq    8(%rsp), %rcx
        movq    %rcx, -32768(%rbx)
        leaq    -8(%rbp), %rsp
        popq    %rbx
        popq    %rbp
        retq
; └

julia> @code_native volume(c2)
        .text
; ┌ @ test-perf.jl:13 within `volume'
; │┌ @ Base.jl:33 within `getproperty'
        vmovsd  (%rdi), %xmm0                   # xmm0 = mem[0],zero
; │└
; │┌ @ operators.jl:560 within `*' @ float.jl:332
        vmulsd  8(%rdi), %xmm0, %xmm0
        vmulsd  16(%rdi), %xmm0, %xmm0
; │└
        retq
        nop
; └

julia> @code_native volume(c3)
        .text
; ┌ @ test-perf.jl:20 within `volume'
; │┌ @ Base.jl:33 within `getproperty'
        vmovsd  (%rdi), %xmm0                   # xmm0 = mem[0],zero
; │└
; │┌ @ operators.jl:560 within `*' @ float.jl:332
        vmulsd  8(%rdi), %xmm0, %xmm0
        vmulsd  16(%rdi), %xmm0, %xmm0
; │└
        retq
        nop
; └

julia>

test-perf.jl代码如下:

mutable struct Cube # box dimensions; fields carry no type annotation, so each field has type Any
    length # untyped field
    width # untyped field
    height # untyped field
end
volume(c::Cube) = c.length*c.width*c.height # product of the three dimension fields

mutable struct Cube_typed # box dimensions stored as concrete Float64 fields
    length::Float64
    width::Float64
    height::Float64
end
volume(c::Cube_typed) = c.length*c.width*c.height # product of the three Float64 fields

mutable struct Cube_parametric_typed{T <: Real} # box dimensions sharing one field type T, restricted to subtypes of Real
    length::T
    width::T
    height::T
end
volume(c::Cube_parametric_typed) = c.length*c.width*c.height # product of the three dimension fields

c1 = Cube(1.1,1.2,1.3) # non-const global; untyped fields
c2 = Cube_typed(1.1,1.2,1.3) # non-const global; Float64 fields
c3 = Cube_parametric_typed(1.1,1.2,1.3) # non-const global; T is taken from the Float64 arguments
@show volume(c1) == volume(c2) == volume(c3) # check that all three variants return the same volume

using BenchmarkTools
@btime volume(c1) # untyped fields; global c1 is referenced without $ interpolation
@btime volume(c2) # Float64 fields; global c2 is referenced without $ interpolation
@btime volume(c3) # parametric fields; global c3 is referenced without $ interpolation

我感觉 @btime volume(c2)（typed float）和 @btime volume(c3)（typed parametric）应该一样的啊，结果差挺多的。然后我想用 @code_native 看看到底有啥不一样，结果发现完全一样……

Eggiverse · 2021 年4 月 2 日 15:34

用 BenchmarkTools 时，全局变量要用 interpolation，即 @btime volume($c1)，否则结果不可靠，你再试试吧

johnnychen94 · 2021 年4 月 2 日 18:58

具体的解释可以在这里找到：Performance Tips · The Julia Language（https://docs.julialang.org/en/v1/manual/performance-tips/）

简单来说，因为类型不明确，所以创建这个结构体的时候内存分配的效率变低了。性能不是差在计算上，而是差在 c2 和 c3 的创建上。如果你用 @btime volume($c2) 来测试的话，结果应该是一致的。

magicly · 2021 年4 月 6 日 02:28

懂了，谢谢两位

magicly · 2021 年4 月 6 日 03:09

看了之后，我感觉跟

因为类型不明确，所以创建这个结构体的时候内存分配的效率变低了

这个没关系，因为Cube_parametric_typed其实创建的时候是知道类型为Float64的，从下面代码也可以看出，其实生成的代码是完全一样的

julia> @code_native Cube_parametric_typed(1.1, 1.2, 1.3)
        .text
; ┌ @ test-perf.jl:16 within `Cube_parametric_typed'
        subq    $24, %rsp
        vmovsd  %xmm2, 16(%rsp)
        vmovsd  %xmm1, 8(%rsp)
        vmovsd  %xmm0, (%rsp)
        movq    %fs:0, %rdi
        addq    $-32768, %rdi                   # imm = 0x8000
; │ @ test-perf.jl:16 within `Cube_parametric_typed' @ test-perf.jl:16
        movabsq $140420672357305, %rax          # imm = 0x7FB63C4B83B9
        movl    $1424, %esi                     # imm = 0x590
        movl    $32, %edx
        callq   *%rax
        movabsq $140420348236400, %rcx          # imm = 0x7FB628F9D270
        movq    %rcx, -8(%rax)
        vmovsd  (%rsp), %xmm0                   # xmm0 = mem[0],zero
        vmovsd  %xmm0, (%rax)
        vmovsd  8(%rsp), %xmm0                  # xmm0 = mem[0],zero
        vmovsd  %xmm0, 8(%rax)
        vmovsd  16(%rsp), %xmm0                 # xmm0 = mem[0],zero
        vmovsd  %xmm0, 16(%rax)
; │ @ test-perf.jl:16 within `Cube_parametric_typed'
        addq    $24, %rsp
        retq
        nopl    (%rax)

julia> @code_native Cube_typed(1.1, 1.2, 1.3)
        .text
; ┌ @ test-perf.jl:9 within `Cube_typed'
        subq    $24, %rsp
        vmovsd  %xmm2, 16(%rsp)
        vmovsd  %xmm1, 8(%rsp)
        vmovsd  %xmm0, (%rsp)
        movq    %fs:0, %rdi
        addq    $-32768, %rdi                   # imm = 0x8000
        movabsq $140420672357305, %rax          # imm = 0x7FB63C4B83B9
        movl    $1424, %esi                     # imm = 0x590
        movl    $32, %edx
        callq   *%rax
        movabsq $140420348057424, %rcx          # imm = 0x7FB628F71750
        movq    %rcx, -8(%rax)
        vmovsd  (%rsp), %xmm0                   # xmm0 = mem[0],zero
        vmovsd  %xmm0, (%rax)
        vmovsd  8(%rsp), %xmm0                  # xmm0 = mem[0],zero
        vmovsd  %xmm0, 8(%rax)
        vmovsd  16(%rsp), %xmm0                 # xmm0 = mem[0],zero
        vmovsd  %xmm0, 16(%rax)
        addq    $24, %rsp
        retq
        nopl    (%rax)

@code_native Cube(1.1, 1.2, 1.3)就长很多，这里就不贴了。

实际上问题根源就是 @Eggiverse 说的

用 BenchmarkTools 时，全局变量要用 interpolation，即 @btime volume($c1)，否则结果不可靠，

我如果把三个变量都标为 const，或者在 @btime 中引用变量时加上 $，c2 和 c3 的结果就一致了（c1 由于字段没有类型标注，仍然慢得多）：

julia> include("src/test-perf.jl")
volume(c1) == volume(c2) == volume(c3) = true
  15.808 ns (1 allocation: 16 bytes)
  1.151 ns (0 allocations: 0 bytes)
  1.349 ns (0 allocations: 0 bytes)
1.7160000000000002