求助!Cell List算法写出来没有两重for循环快

问题描述:在二维平面的一系列点中找出所有距离小于给定值s的点对,并输出结果
实现方法:

  1. 两重for循环 for i = 1:NumberNod-1 for j = i+1:NumerNod
  2. Cell List算法: 先将点放进cell中,然后只需要搜索一个邻近的8个cell即可

遇到困难:采用Cell List算法写出的代码反而没有两重for循环快

代码:

fixed-radius nearest neighbor

=============================================================================
#cell list algorithm

function FRNNMain(NumNod::Int64,Dim::Int64,A::Array{Float64,2},s::Float64)
    r = s/sqrt(Dim)
    PoiPair = Array{Tuple{Int64,Int64},1}(undef,1)
    Cell = PoiCell(A,NumNod,r)
    for key in keys(Cell)
        vaule = Cell[key]
        InnerCellPair!(vaule,PoiPair)
        LoopNeighbor!(key,Cell,PoiPair,A,s)
    end
    return PoiPair
end
function PoiCell(A::Matrix{Float64},NumNod::Int64,r::Float64)
    Cell = Dict()
    for i = 1:1:NumNod
        x = Int(floor(A[i,1]/r))
        y = Int(floor(A[i,2]/r))
        z = (x,y)
        if z ∉ keys(Cell)
            Cell[z] = [i]
        else
            push!(Cell[z],i)
        end
    end
    return Cell
end
function GetNeighborCell(key::Tuple{Int64,Int64})
    NeiCell = Array{Tuple}(undef,8,1)
    NeiCell[1] = (key[1]+1,key[2]+1)
    NeiCell[2] = (key[1]+1,key[2])
    NeiCell[3] = (key[1]+1,key[2]-1)
    NeiCell[4] = (key[1],key[2]+1)
    NeiCell[5] = (key[1],key[2]-1)
    NeiCell[6] = (key[1]-1,key[2])
    NeiCell[7] = (key[1]-1,key[2]+1)
    NeiCell[8] = (key[1]-1,key[2]-1)
    return NeiCell
end
function InnerCellPair!(value::Array{Int64},pairs::Array{Tuple{Int64,Int64},1})
    Len = length(value)
    if Len >= 2
        for i = 1:Len-1
            for j = (i+1):Len
                z = (value[i],value[j])
                push!(pairs,z)
            end
        end
    end
    return pairs
end
function LoopNeighbor!(key::Tuple{Int64,Int64},Cell::Dict{Any,Any},PoiPair::Array{Tuple{Int64,Int64},1},A::Array{Float64,2},s::Float64)
    value = Cell[key]
    NeiCell = GetNeighborCell(key)
    for i = 1:8
        key1 = NeiCell[i]
        if key1 ∈ keys(Cell)
            value1 = Cell[NeiCell[i]]
            FindPair!(value,value1,PoiPair,A,s)
        end
    end
    return PoiPair
end
function FindPair!(value,value1,PoiPair::Array{Tuple{Int64,Int64},1},A::Array{Float64,2},s::Float64)
    Len1 = length(value)
    Len2 = length(value1)
    for i = 1:Len1
        for j = 1:Len2
            P1 = value[i]
            P2 = value1[j]
            Point1 = A[P1,:]
            Point2 = A[P2,:]
            Dist = Distance(Point1,Point2)
            if Dist <= s^2
                AddPair!(P1,P2,PoiPair)
            end
        end
    end
    return PoiPair
end
function Distance(Point1::Array{Float64,1},Point2::Array{Float64,1})
    Dist = (Point1[1]-Point2[1])^2+(Point1[2]-Point2[2])^2
    return Dist
end
function AddPair!(P1::Int64,P2::Int64,PoiPair::Array{Tuple{Int64,Int64},1})
    pair1 = (P1,P2)
    pair2 = (P2,P1)
    if pair1 ∉ PoiPair
        if pair2 ∉ PoiPair
            push!(PoiPair,pair1)
        end
    end
    return PoiPair
end
NumNod = 1000
Dim = 2
A = rand(NumNod,Dim)
s = 0.1;
a = FRNNMain(NumNod,Dim,A,s)

=============================================================================#forloop

NumNod = 1000
Dim = 2
A = rand(NumNod,Dim)
s = 0.1;
function forloop(A::Matrix{Float64},NumNod::Int64,s::Float64)
    PoiPair = []
    for i = 1:NumNod-1
        for j = i+1:NumNod
            Point1 = A[i,:]
            Point2 = A[j,:]
            Dist = (Point1[1]-Point2[1])^2+(Point1[2]-Point2[2])^2
            if Dist < s
                push!(PoiPair,(i,j))
            end
        end
    end
    return PoiPair
end
b = forloop(A,NumNod,s)

=============================================================================
运行时间:
@btime FRNNMain(NumNod,Dim,A,s)
142.194 ms (79608 allocations: 7.33 MiB)
@btime forloop(A,NumNod,s)
46.899 ms (1117806 allocations: 97.09 MiB)
刚刚从Matlab转过来,profiler的结果有点看不懂,就不往上贴了
可能存在的问题:在搜索邻近cell的过程中存在着重复

那个,要不代码用"```"包起来?
```julia
println(“hello world”)
```
缩进也调一下

调好啦,第一次发帖,不太熟悉markdown操作,见谅了

说实话,这个我不会
能讲讲你的思路吗,我作死试一试
你写了一堆不知道是什么

r = s/sqrt(2)是cell的尺寸,这样cell里的点的距离就都小于s了
用点的坐标除以r向下取整,得到点所在的cell的左下顶点的坐标,把这个坐标当成key,点编号当成value存到字典里面
循环每一个cell
每一个cell里面的点两两组合形成点对
循环每一个cell周围的8个cell,看这些cell里面有没有点对
结束。
主要是参照这个问题里的回答写的
[How could I utilize algorithms for nearest neighbor search to do fixed radius search? - Stack Overflow]

cell是什么?

@btime FRNNMain(NumNod,Dim,A,s)
142.194 ms (79608 allocations: 7.33 MiB)
@btime forloop(A,NumNod,s)
46.899 ms (1117806 allocations: 97.09 MiB)

这里cell算法用到的内存明显比forloop少很多,速度也比较慢,可能是算法写的有问题,试试重写一遍?

cell就是小格子,相当于把二维平面划分成一个个的小格子,把点放到这些小格子里面。
知乎上的这个问题和回答说的比较清楚

重写是换一种思路写么?

Point1 = A[i,:]
Point2 = A[j,:]
Dist = (Point1[1]-Point2[1])^2+(Point1[2]-Point2[2])^2

这样写 p1, p2 会复制,浪费内存。
改成 (A[i, 1] - A[j, 1])^2 + ... 能把内存占用压到 1MB 左右

剩下的消耗内存多的操作就是 notinin 这种了,说明瓶颈不在内存上。

内存分配计数见 julia --track-allocation cell_list.jl
如果函数体内第一行的计数非常大,那他其实就是函数的总计数。这里有点小 bug。


    Len1 = length(value)
    Len2 = length(value1)
    for i = 1:Len1
        for j = 1:Len2
            P1 = value[i]
            P2 = value1[j]

这里直接迭代就好

for p1 in value
    for p2 in value1

最好是连复制都要避免,从上一层直接传 Cellkey 然后迭代。

你好,按照你说的方法修改程序以后占用的内存确实压缩了很多,但是速度反而更慢了

@btime FRNNMain(NumNod,Dim,A,s)
  201.197 ms (7702 allocations: 761.42 KiB)

请问这是为啥呢

更新:
按照woclass大佬的方法减少内存以后,又按照性能建议手册把floor函数换成fld函数,把矩阵按行访问改成按列访问,把Dict的类型声明了,代码如下:

# fied-radius nearest neighbor
#cell list algorithm
function FRNNMain(NumNod::Int64,Dim::Int64,A::Array{Float64,2},s::Float64)
    r = s/sqrt(Dim)
    PoiPair = Array{Tuple{Int64,Int64},1}(undef,1)
    popfirst!(PoiPair)
    Cell = PoiCell(A,NumNod,r)
    for key in keys(Cell)
        vaule = Cell[key]
        InnerCellPair!(vaule,PoiPair)
        LoopNeighbor!(key,Cell,PoiPair,A,s)
    end
    return PoiPair
end
function PoiCell(A::Matrix{Float64},NumNod::Int64,r::Float64)
    Cell = Dict{Tuple{Int64,Int64},Array{Int64,1}}()
    for i = 1:1:NumNod
        x = fld(A[1,i],r)
        y = fld(A[2,i],r)
        z = (x,y)
        if z ∉ keys(Cell)
            Cell[z] = [i]
        else
            push!(Cell[z],i)
        end
    end
    return Cell
end
function GetNeighborCell(key::Tuple{Int64,Int64})
    NeiCell = Array{Tuple{Int64,Int64}}(undef,8,1)
    NeiCell[1] = (key[1]+1,key[2]+1)
    NeiCell[2] = (key[1]+1,key[2])
    NeiCell[3] = (key[1]+1,key[2]-1)
    NeiCell[4] = (key[1],key[2]+1)
    NeiCell[5] = (key[1],key[2]-1)
    NeiCell[6] = (key[1]-1,key[2])
    NeiCell[7] = (key[1]-1,key[2]+1)
    NeiCell[8] = (key[1]-1,key[2]-1)
    return NeiCell
end
function InnerCellPair!(value::Array{Int64},pairs::Array{Tuple{Int64,Int64},1})
    Len = length(value)::Int64
    if Len >= 2
        for i = 1:Len-1
            for j = (i+1):Len
                z = (value[i],value[j])
                push!(pairs,z)
            end
        end
    end
    return pairs
end
function LoopNeighbor!(key::Tuple{Int64,Int64},Cell::Dict{Tuple{Int64,Int64},Array{Int64,1}},PoiPair::Array{Tuple{Int64,Int64},1},A::Array{Float64,2},s::Float64)
    NeiCell = Array{Tuple{Int64,Int64}}(undef,8,1)
    NeiCell = GetNeighborCell(key)
    for i = 1:8
        if NeiCell[i] ∈ keys(Cell)
            FindPair!(Cell[key],Cell[NeiCell[i]],PoiPair,A,s)
        end
    end
    return PoiPair
end
function FindPair!(value::Array{Int64},value1::Array{Int64},PoiPair::Array{Tuple{Int64,Int64},1},A::Array{Float64,2},s::Float64)
    for P1 in value
        for P2 in value1
            #Dist = Distance(A[P1,:],A[P2,:])
            Dist::Float64 = (A[1,P1]-A[1,P2])^2+(A[2,P1]-A[2,P2])^2
            if Dist <= s^2
                AddPair!(P1,P2,PoiPair)
            end
        end
    end
    return PoiPair
end
function Distance(Point1::Array{Float64,1},Point2::Array{Float64,1})
    Dist = (Point1[1]-Point2[1])^2+(Point1[2]-Point2[2])^2::Float64
    return Dist
end
function AddPair!(P1::Int64,P2::Int64,PoiPair::Array{Tuple{Int64,Int64},1})
    pair1 = (P1,P2)
    pair2 = (P2,P1)
    if pair1 ∉ PoiPair
        if pair2 ∉ PoiPair
            push!(PoiPair,pair1)
        end
    end
    return PoiPair
end
NumNod = 1000
Dim = 2
A = rand(Dim,NumNod)
s = 0.1;
a = FRNNMain(NumNod,Dim,A,s)

此时运算时间为

@btime FRNNMain(NumNod,Dim,A,s)
  209.789 ms (961 allocations: 670.45 KiB)

但是把for循环的程序也按照上面的思路修改了一下,程序为

NumNod = 1000
Dim = 2
A = rand(Dim,NumNod)
s = 0.1;
function forloop(A::Matrix{Float64},NumNod::Int64,s::Float64)
    PoiPair = Array{Tuple{Int64,Int64},1}(undef,1)
    popfirst!(PoiPair)
    for i = 1:NumNod-1
        for j = i+1:NumNod
            Dist = (A[1,i]-A[1,j])^2+(A[2,i]-A[2,j])^2
            if Dist < s^2
                push!(PoiPair,(i,j))
            end
        end
    end
    return PoiPair
end
b = forloop(A,NumNod,s)

这个算得更快了,运行时间为

@btime forloop(A,NumNod,s)
  972.200 μs (14 allocations: 512.67 KiB)

感到深深的绝望…怎么能算得这么慢

再贴一个code_warntype的结果

@code_warntype FRNNMain(NumNod,Dim,A,s)
Variables
  #self#::Core.Compiler.Const(FRNNMain, false)
  NumNod::Int64
  Dim::Int64
  A::Array{Float64,2}
  s::Float64
  r::Float64
  PoiPair::Array{Tuple{Int64,Int64},1}
  Cell::Dict{Tuple{Int64,Int64},Array{Int64,1}}
  @_9::Union{Nothing, Tuple{Tuple{Int64,Int64},Int64}}
  key::Tuple{Int64,Int64}
  vaule::Array{Int64,1}

Body::Array{Tuple{Int64,Int64},1}
1 ─ %1  = Main.sqrt(Dim)::Float64
│         (r = s / %1)
│   %3  = Core.apply_type(Main.Tuple, Main.Int64, Main.Int64)::Core.Compiler.Const(Tuple{Int64,Int64}, false)
│   %4  = Core.apply_type(Main.Array, %3, 1)::Core.Compiler.Const(Array{Tuple{Int64,Int64},1}, false)
│         (PoiPair = (%4)(Main.undef, 1))
│         Main.popfirst!(PoiPair)
│         (Cell = Main.PoiCell(A, NumNod, r))
│   %8  = Main.keys(Cell)::Base.KeySet{Tuple{Int64,Int64},Dict{Tuple{Int64,Int64},Array{Int64,1}}}
│         (@_9 = Base.iterate(%8))
│   %10 = (@_9 === nothing)::Bool
│   %11 = Base.not_int(%10)::Bool
└──       goto #4 if not %11
2 ┄ %13 = @_9::Tuple{Tuple{Int64,Int64},Int64}::Tuple{Tuple{Int64,Int64},Int64}
│         (key = Core.getfield(%13, 1))
│   %15 = Core.getfield(%13, 2)::Int64
│         (vaule = Base.getindex(Cell, key))
│         Main.InnerCellPair!(vaule, PoiPair)
│         Main.LoopNeighbor!(key, Cell, PoiPair, A, s)
│         (@_9 = Base.iterate(%8, %15))
│   %20 = (@_9 === nothing)::Bool
│   %21 = Base.not_int(%20)::Bool
└──       goto #4 if not %21
3 ─       goto #2
4 ┄       return PoiPair

有可能是惰性函数的问题2020-03-27 12-53-55 的屏幕截图 2020-03-27 12-54-04 的屏幕截图
我猜想在你调用第一个函数的时候,@time把得到rand(100,2)所有元素的时间算进去了


试试换个顺序,先用forloop


纠错:Fak,看来我可能弄错了,不过我还是想看一下换个顺序会怎么样


确实有点慢,不知道是不是数据结构的问题,查询Dist内容可能费点时间

我感觉你给的测试条件压力不大。

julia 的 for 很给力。小样本量基本优势很大,大样本才能看出算法上的差距

改变数组大小 NumNod

julia> for NumNod in collect(1:10) * 1000
           A = rand(NumNod,Dim)*100;
           @time forloop(A,NumNod,s);
       end
  0.196111 seconds (999.02 k allocations: 91.462 MiB, 51.76% gc time)
  0.249301 seconds (4.00 M allocations: 366.031 MiB, 19.79% gc time)
  0.397443 seconds (9.00 M allocations: 823.709 MiB, 3.72% gc time)
  1.482190 seconds (16.00 M allocations: 1.430 GiB, 2.91% gc time)
  1.201030 seconds (25.00 M allocations: 2.235 GiB, 3.60% gc time)
  1.675108 seconds (35.99 M allocations: 3.218 GiB, 3.59% gc time)
  2.311076 seconds (48.99 M allocations: 4.380 GiB, 3.65% gc time)
  4.310118 seconds (63.99 M allocations: 5.721 GiB, 3.03% gc time)

julia> for NumNod in collect(1:10) * 10_000
           A = rand(NumNod,Dim)*100;
           @time forloop(A,NumNod,s);
       end
  4.261473 seconds (99.99 M allocations: 8.940 GiB, 4.27% gc time)
 18.061562 seconds (399.99 M allocations: 35.761 GiB, 4.63% gc time)
 37.787276 seconds (899.98 M allocations: 80.464 GiB, 4.09% gc time)
114.836089 seconds (1.60 G allocations: 143.049 GiB, 3.38% gc time)
^C
julia> for NumNod in collect(1:10) * 1000
           A = rand(NumNod,Dim)*100;
           @time FRNNMain(NumNod,Dim,A,s);
       end
  0.037698 seconds (55.18 k allocations: 2.433 MiB)
  0.054817 seconds (92.95 k allocations: 3.535 MiB)
  0.005024 seconds (76.64 k allocations: 2.737 MiB)
  0.010408 seconds (103.70 k allocations: 3.610 MiB)
  0.022965 seconds (134.55 k allocations: 4.594 MiB)
  0.022106 seconds (167.60 k allocations: 5.660 MiB)
  0.040841 seconds (213.71 k allocations: 7.119 MiB, 33.68% gc time)
  0.025413 seconds (270.34 k allocations: 8.902 MiB)
  0.042115 seconds (348.90 k allocations: 11.345 MiB)
  0.038399 seconds (475.34 k allocations: 15.266 MiB)

julia> for NumNod in collect(1:10) * 10_000
           A = rand(NumNod,Dim)*100;
           @time FRNNMain(NumNod,Dim,A,s);
       end
  0.039367 seconds (441.21 k allocations: 14.224 MiB, 25.38% gc time)
  0.050425 seconds (539.30 k allocations: 18.547 MiB)
  0.089731 seconds (961.16 k allocations: 32.120 MiB, 13.99% gc time)
  0.184787 seconds (1.80 M allocations: 58.545 MiB, 7.14% gc time)
  0.177084 seconds (1.32 M allocations: 47.563 MiB, 20.97% gc time)
  0.241691 seconds (1.57 M allocations: 56.436 MiB, 25.85% gc time)
  0.224645 seconds (1.87 M allocations: 66.499 MiB, 5.91% gc time)
  0.279534 seconds (2.18 M allocations: 77.317 MiB, 5.94% gc time)
  0.348275 seconds (2.52 M allocations: 88.803 MiB, 6.01% gc time)
  0.521723 seconds (2.91 M allocations: 101.983 MiB, 7.30% gc time)

julia> for NumNod in collect(1:10) * 100_000
           A = rand(NumNod,Dim)*100;
           @time FRNNMain(NumNod,Dim,A,s);
       end
  0.535916 seconds (2.91 M allocations: 102.119 MiB, 9.12% gc time)
  5.450482 seconds (6.25 M allocations: 225.615 MiB, 4.79% gc time)
 28.951260 seconds (11.60 M allocations: 416.061 MiB, 1.47% gc time)
 87.724544 seconds (13.12 M allocations: 504.621 MiB, 0.63% gc time)
336.967927 seconds (17.41 M allocations: 680.904 MiB, 0.22% gc time)
^C

改变分散程度

预计 for 不会有太大的变化

julia> NumNod = 5000
5000

julia> for scale in 1:10
           A = rand(NumNod,Dim)*scale*100;
           @time forloop(A,NumNod,s);
       end
  1.398829 seconds (25.00 M allocations: 2.235 GiB, 3.46% gc time)
  1.336941 seconds (25.00 M allocations: 2.235 GiB, 3.55% gc time)
  1.528647 seconds (25.00 M allocations: 2.235 GiB, 3.32% gc time)
  1.312679 seconds (25.00 M allocations: 2.235 GiB, 3.64% gc time)
  1.321496 seconds (25.00 M allocations: 2.235 GiB, 3.57% gc time)
  1.362860 seconds (25.00 M allocations: 2.235 GiB, 3.53% gc time)
  1.601800 seconds (25.00 M allocations: 2.235 GiB, 3.53% gc time)
  1.664082 seconds (25.00 M allocations: 2.235 GiB, 3.60% gc time)
  2.062444 seconds (25.00 M allocations: 2.235 GiB, 2.87% gc time)
  2.135823 seconds (25.00 M allocations: 2.235 GiB, 2.97% gc time)

划分网格的当然是越稀疏效果越好

julia> NumNod = 200_000
200000

julia> for scale in 1:10
           A = rand(NumNod,Dim)*scale*100;
           @time FRNNMain(NumNod,Dim,A,s);
       end
  7.165528 seconds (6.25 M allocations: 225.755 MiB, 4.89% gc time)
  1.157459 seconds (6.20 M allocations: 210.068 MiB, 12.50% gc time)
  1.126173 seconds (6.16 M allocations: 206.322 MiB, 31.87% gc time)
  0.874867 seconds (6.18 M allocations: 205.859 MiB, 16.87% gc time)
  1.068134 seconds (6.16 M allocations: 205.089 MiB, 25.65% gc time)
  1.082324 seconds (6.17 M allocations: 205.182 MiB, 23.96% gc time)
  1.161492 seconds (6.18 M allocations: 205.316 MiB, 24.34% gc time)
  1.142114 seconds (6.18 M allocations: 205.177 MiB, 23.17% gc time)
  0.886259 seconds (6.17 M allocations: 204.759 MiB, 20.16% gc time)
  1.234273 seconds (6.18 M allocations: 204.897 MiB, 28.85% gc time)

julia> NumNod = 300_000
300000

julia> for scale in 1:10
           A = rand(NumNod,Dim)*scale*100;
           @time FRNNMain(NumNod,Dim,A,s);
       end
 29.940059 seconds (11.67 M allocations: 418.380 MiB, 1.40% gc time)
  3.063644 seconds (12.31 M allocations: 407.787 MiB, 7.55% gc time)
  2.303256 seconds (12.58 M allocations: 409.451 MiB, 16.61% gc time)
  1.653237 seconds (12.58 M allocations: 407.519 MiB, 18.47% gc time)
  1.580190 seconds (12.66 M allocations: 409.065 MiB, 13.93% gc time)
  4.677850 seconds (12.64 M allocations: 407.666 MiB, 18.80% gc time)
  2.241075 seconds (12.70 M allocations: 409.340 MiB, 19.28% gc time)
  3.397634 seconds (12.69 M allocations: 408.701 MiB, 8.65% gc time)
  2.453740 seconds (12.70 M allocations: 408.790 MiB, 21.53% gc time)
  1.601407 seconds (12.69 M allocations: 408.511 MiB, 21.20% gc time)

julia> 

另外你的 GetNeighborCell 估计有点问题,你得计算所有的 N

因为格子边长为 r = s/sqrt(d),二维有 r = s/sqrt(2)r < s < 2r = sqrt(2)*r
而格子的对角线刚好是 s。在纸上画一画就知道了。

   NNN    index deltas:    (-1,  2) ( 0,  2) ( 1,  2)
  NNNNN           (-2,  1) (-1,  1) ( 0,  1) ( 1,  1) ( 2,  1)
  NNANN           (-2,  0) (-1,  0) ( 0,  0) ( 1,  0) ( 2,  0)
  NNNNN           (-2, -1) (-1, -1) ( 0, -1) ( 1, -1) ( 2, -1)
   NNN                     (-1, -2) ( 0, -2) ( 1, -2)

对的,我刚刚睡觉迷迷糊糊也想到这个了

我想直接把cell尺寸直接设成s,这样虽然要多搜一下自己,但是可以少搜好几个cell。

重要的是实际搜索的面积,你算一算用 s/{\sqrt 2} 这样搜的次数多但是面积小。
图中就是 s/{\sqrt 2}\times s \times4 + \pi s^2 = (\pi + 2\sqrt2) s^2 \approx 5.96 s^2

那么 A 区块的对角线距离就大于 s 了,这样你就需要搜索 A 区块。
一共 9 个区块,面积为 9s^2 ,实际是得不偿失的。

高维就是体积了。

sf 的评论里提到过 r 的选取问题。

对,我刚刚试了一下搜自己的话反而算得慢了。
另一个想问的问题是你在A后面乘了100以后有没有把s也相应的扩大?
我这边算下来的结果是只把A扩大100倍以后,s仍取0.1,FRNNMain的速度确实比forloop 快

NumNod = 1000
Dim = 2
s = 0.1
println("Cell List")
for i = collect(1:10)*NumNod
    A = rand(Dim,i)*100
    @time FRNNMain(NumNod,Dim,A,s)
end
println("For Loop")
for i = collect(1:10)*NumNod
    A = rand(Dim,i)*100
    @time forloop(A,NumNod,s)
end

运行结果是

Cell List
  0.001097 seconds (3.02 k allocations: 1010.063 KiB)
  0.000870 seconds (3.02 k allocations: 1010.141 KiB)
  0.000847 seconds (3.02 k allocations: 1010.063 KiB)
  0.000886 seconds (3.02 k allocations: 1010.141 KiB)
  0.000884 seconds (3.02 k allocations: 1010.141 KiB)
  0.001272 seconds (3.02 k allocations: 1010.281 KiB)
  0.000889 seconds (3.02 k allocations: 1010.063 KiB)
  0.001526 seconds (3.02 k allocations: 1010.063 KiB)
  0.000702 seconds (3.02 k allocations: 1009.313 KiB)
  0.000887 seconds (3.02 k allocations: 1010.063 KiB)
For Loop
  0.001759 seconds (1.00 k allocations: 3.961 MiB)
  0.005201 seconds (1.00 k allocations: 3.961 MiB)
  0.003741 seconds (1.00 k allocations: 3.961 MiB)
  0.003309 seconds (1.00 k allocations: 3.961 MiB)
  0.004977 seconds (1.00 k allocations: 3.961 MiB)
  0.003765 seconds (1.00 k allocations: 3.961 MiB)
  0.004728 seconds (1.00 k allocations: 3.961 MiB)
  0.003427 seconds (1.00 k allocations: 3.961 MiB)
  0.003933 seconds (1.00 k allocations: 3.961 MiB)
  0.016121 seconds (1.00 k allocations: 3.961 MiB, 75.12% gc time)

这种情况即为距离小于s的点几乎没有,因为所有点的坐标都扩大了100倍,所以给在loop的时候几乎没有neighbor cell在字典里,所以很快。
但是如果我同时把s也乘了100,程序是

NumNod = 1000
Dim = 2
s = 0.1*100
println("Cell List")
for i = collect(1:10)*NumNod
    A = rand(Dim,i)*100
    @time FRNNMain(NumNod,Dim,A,s)
end
println("For Loop")
for i = collect(1:10)*NumNod
    A = rand(Dim,i)*100
    @time forloop(A,NumNod,s)
end

运行结果是

Cell List
  0.150363 seconds (978 allocations: 751.547 KiB)
  0.142117 seconds (965 allocations: 748.063 KiB)
  0.139737 seconds (963 allocations: 747.750 KiB)
  0.150158 seconds (956 allocations: 746.719 KiB)
  0.146481 seconds (979 allocations: 752.359 KiB)
  0.142613 seconds (958 allocations: 747.078 KiB)
  0.138542 seconds (974 allocations: 752.422 KiB)
  0.137803 seconds (966 allocations: 747.016 KiB)
  0.141860 seconds (977 allocations: 750.859 KiB)
  0.149367 seconds (972 allocations: 750.219 KiB)
For Loop
  0.002087 seconds (1.01 k allocations: 4.461 MiB)
  0.002893 seconds (1.01 k allocations: 4.461 MiB)
  0.004925 seconds (1.01 k allocations: 4.461 MiB)
  0.003822 seconds (1.01 k allocations: 4.461 MiB)
  0.015138 seconds (1.01 k allocations: 4.461 MiB, 76.00% gc time)
  0.001978 seconds (1.01 k allocations: 4.461 MiB)
  0.004069 seconds (1.01 k allocations: 4.461 MiB)
  0.003221 seconds (1.01 k allocations: 4.461 MiB)
  0.003352 seconds (1.01 k allocations: 4.461 MiB)
  0.003903 seconds (1.01 k allocations: 4.461 MiB)

FRNNMain的速度还是远慢于forloop

我测的时候没改变 s,因为你等比放大没意义啊。
我上面测得结果是 for 再 10_000 这个量级上时间就很长了,分块的能比他多一个量级。

小样本还可以直接构造 N*N 三角阵直接算。

嗯…不是说等比放大没意义,其实问题的关键是只放大A的话就相当于在FRNNMain里面人为跳过搜索过程了…
之前rand出来的坐标都在[0,1]之间,距离小于0.1的点对很多,这样每一个cell的neighbor cell很有可能在Cell这个字典里面,搜的时候就要多花一点时间;
如果是把rand出来的坐标乘了100,这样点的坐标就都在[1,100]里了,s还是0.1,距离小于0.1的点对很少。另一方面,算出来cell的顶点坐标之间的差异也很大,应该在百和千量级的差异上。一个cell的neighbor cell只是其坐标加减1,2,很有可能就不再Cell这个字典里面,搜索的过程就没有了,相当于计算过程只是把Cell的key循环了一遍,后面的搜索都没做。
forloop里面不管s是多少,每个点都要搜一遍,所以肯定比不搜来得慢鸭。
而且从实际意义出发,点之间的相对距离变大以后s也应该相应的变大鸭。
(困在家里的俺只有轻薄本,i5-8250U的CPU10000个点完全跑不动)