修改程序-最近近邻算法函数


#1

@Roger:我修改了一下你的帖子,下次请注意使用markdown的代码模块来发帖子,否则很难读。

此外请贴出你的MWE(最小可执行示例),因为大段的代码读起来非常的痛苦,也很难让别人帮助你发现问题。

function distance{T<:Number}(x::Array{T,1},y::Array{T,1})
       dist=0
       for i in 1:length(x)
       dist +=(x[i]-y[i])^2
       end
       dist=sqrt(dist)
       return dist
end
In[2]:function classify{T<:Any}(distances::Array{Float64,1},
              labels::Array{T,1},k::Int64)
              class=unique(labels)
nc=length(class)
indexes=Array(Int,0)
M=typemax(typeof(distances[1]))
class_count=Array(Int,nc)
for i in 1:k
       indexes[i]=typemin(distances)
       distances[indexes[i]]=M
       end
klabels=labels[indexes]
for i in 1:nc
       for j in 1:k
       if klabels[j]==class[i]
       class_count[i]+=1
       break
       end
       end
       end
index=typemax(class_count)
return class[index]
end
In[3]
function apply_KNN{T1<:Number,T2<:Any}(X::Array{T1,2},
x::Array{T2,1},Y::Array{T1,2},k::Int64)
N=size(X,1)
n=size(Y,1)
D=Array(Float64,N)
Z=Array(typeof(x[1]),n)
for i in 1:n
       for j in 1:N
       D[j]=distance(X[j,:],Y[i,:])
end
z[i]=classify(D,x,k)
end
return z
end
data=readcsv("magic04.csv")
I=map(Float64,data[:,1:(end-1)])
O=data[:,end]
N=length(O)
n=round(Int64,N/2)
R=randperm(N)
indX=R[1:n]
X=I[indX,:]
x=O[indX]
indY=R[(n+1):end]
Y=I[indY,:]
y=O[indY]
Iz=apply_KNN(X,x,Y,5)
println(sum(y.==z[1])/n)
Println(z[1][1:5],z[2][1:5])

库函数的正确用法?
#2

虽然我不会,但是还是建议你先百度一下 “markdown 代码块” 然后再贴代码,你这样的代码… 估计所有人看了,都不会有改的欲望 :joy:


#3

关键的错误原因没有贴


#4

给点数据喂啦。或者说一下 csv 放数据的要求


改一下 Deprecated syntax + 排版

# KNN (k-nearest neighbor)

# L_2 欧氏距离
function distance(
          x::Array{T, 1}
        , y::Array{T, 1}
    ) where T <: Number
    
    dist = 0
    for i in 1:length(x)
        dist += (x[i]-y[i])^2
    end
    dist = sqrt(dist)
    return dist
end


function classify(  
          distances::Array{Float64, 1}
        , labels::Array{T, 1}
        , k::Int64
    )  where T <: Any

    class = unique(labels)
    nc    = length(class)
    indexes     = Array(Int, 0)
    M           = typemax(typeof(distances[1]))
    class_count = Array(Int,nc)
    
    for i in 1:k
        indexes[i] = typemin(distances) # Inf
        distances[indexes[i]] = M # 不会数组越界?
    end

    klabels = labels[indexes]
    for i in 1:nc
        for j in 1:k
            if klabels[j] == class[i]
                class_count[i] += 1
                break
            end
        end
    end

    index = typemax(class_count) # 想用 max 然后取对应的 index?
    return class[index]
end


function apply_KNN(
          X::Array{T1, 2}
        , x::Array{T2, 1}
        , Y::Array{T1, 2}
        , k::Int64
    ) where {T1 <: Number, T2 <: Any}

    N = size(X, 1)
    n = size(Y, 1)
    D = Array(Float64, N)
    Z = Array(typeof(x[1]), n) # 未使用的变量?

    for i in 1:n
        for j in 1:N
            D[j] = distance(X[j,:], Y[i,:])
        end
        z[i] = classify(D, x, k) # 大小写有问题?
    end

    return z
end


# Test
data = readdlm("tst.csv", ',')

I = map(Float64, data[:, 1:(end-1)])
O = data[:, end]
N = length(O)
n = round(Int64, N/2)
R = randperm(N)

indX = R[1:n]
X = I[indX, :]
x = O[indX]

indY = R[(n+1): end]
Y = I[indY, :]
y = O[indY]

z = apply_KNN(X, x, Y, 5)

println(sum(y .== z[1])/n)
Println(z[1][1:5], z[2][1:5])

看上去问题出在 classify 函数里,逻辑有点混乱。中间 typemin / min, typemax / max 搞混了


#5

最近邻有现成的package: