How do I optimize this algorithm?

I was learning an example with MXNet before, and the Python version takes longer to run than my own Julia port, but its accuracy is higher. (I think I basically just translated the MXNet example into Julia, but I don't know why it comes out a bit worse :joy:)
The Python code is as follows:

import d2lzh as d2l
from mxnet import gluon, init, nd, autograd
from mxnet.gluon import loss as gloss, nn

# MLP: 784 -> 256 (relu) -> 10, weights drawn from Normal(sigma=0.01)
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'), nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})
num_epochs = 10
%time d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

The result is:

epoch 1, loss 0.7997, train acc 0.702, test acc 0.821
epoch 2, loss 0.4910, train acc 0.817, test acc 0.853
epoch 3, loss 0.4233, train acc 0.845, test acc 0.860
epoch 4, loss 0.3942, train acc 0.853, test acc 0.862
epoch 5, loss 0.3715, train acc 0.862, test acc 0.872
epoch 6, loss 0.3519, train acc 0.869, test acc 0.872
epoch 7, loss 0.3396, train acc 0.874, test acc 0.875
epoch 8, loss 0.3239, train acc 0.880, test acc 0.880
epoch 9, loss 0.3168, train acc 0.883, test acc 0.864
epoch 10, loss 0.3050, train acc 0.886, test acc 0.879
CPU times: user 31.9 s, sys: 1.65 s, total: 33.6 s
Wall time: 34.7 s

The Julia code:

using Flux
using Flux: @epochs
using Statistics
using Random
using Parameters: @with_kw
using IterTools: ncycle 

@with_kw mutable struct Args
    lr::Float64 = 0.5
    batch_size::Int = 256
    repeat::Int = 20
end


images = Flux.Data.FashionMNIST.images()
labels = Flux.Data.FashionMNIST.labels()

function get_fashion_label(labels)
	text_label = ["t-shirt", "trouser", "pullover", "dress", "coat", "sandal", "shirt", "sneaker", "bag", "ankle boot"]
	return [text_label[i+1] for i in labels]
end

# build the data matrices (only the first 6000 of the 60,000 training images are used)

data_x = rand(784, 6000)   # preallocate; every column is overwritten in the loop below
data_y = []

for i in 1:6000
	data_x[:,i] = Float64.(reshape(images[i],(784,1)))   # flatten the 28×28 image to a 784-vector
	push!(data_y, get_fashion_label(labels)[i])          # note: converts the whole label vector on every iteration
end

label = sort(unique(data_y))
data_onehot_labels = Flux.onehotbatch(data_y, label)

# 2/3 of the 6000 samples go to training, the remaining 1/3 to testing
train_x = data_x[:, [1:3:6000 ; 2:3:6000]]
train_y = data_onehot_labels[:, [1:3:6000 ; 2:3:6000]]



test_x = data_x[:, 3:3:6000]
test_y = data_onehot_labels[:, 3:3:6000]



train_data = Flux.Data.DataLoader((train_x, train_y), batchsize=Args().batch_size, shuffle=true)

# model: 784 -> 256 (relu) -> 10, same shape as the MXNet network
model = Chain(
    Dense(784, 256, relu),
    Dense(256, 10)
    )

# loss function: softmax cross entropy on the raw logits
loss(x, y) = Flux.logitcrossentropy(model(x), y)

# params
ps = Flux.params(model)

# SGD with the same learning rate as the MXNet trainer
opt = Descent(Args().lr)

# ncycle repeats the DataLoader Args().repeat times, i.e. roughly 20 epochs
@time Flux.train!(loss, ps, ncycle(train_data, Args().repeat), opt)

accuracy(x, y, model) = Flux.mean(Flux.onecold(model(x)) .== Flux.onecold(y))

print("train acc: ", accuracy(train_x, train_y, model), ", test acc: ", accuracy(test_x, test_y, model))

Result:

12.571922 seconds (314.00 M allocations: 6.982 GiB, 6.54% gc time)
train acc: 0.8555, test acc: 0.8285

:thinking:

Uh, when timing code you would normally use @btime your_code, but you need to do using BenchmarkTools first.
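Something like this (just a sketch; @btime runs the expression several times and reports the minimum, and interpolating the globals with $ avoids measuring global-variable overhead):

using BenchmarkTools

# benchmark one pass over the DataLoader; note that training mutates the model,
# so for timing a full run a single warmed-up @time is usually good enough
@btime Flux.train!($loss, $ps, $train_data, $opt)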

These two implementations aren't really comparable, though…

Looking at the model alone, the main difference is the parameter initialization.
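The MXNet example draws its weights from Normal(sigma=0.01) via init.Normal, while Flux's Dense defaults to glorot_uniform. If you want to line the two up, a sketch like the one below should work (the keyword is initW on older Flux releases and init on newer ones, so adjust for your version):

# weights ~ Normal(0, 0.01), mirroring init.Normal(sigma=0.01) in the MXNet code
init_normal(dims...) = 0.01f0 .* randn(Float32, dims...)

model = Chain(
    Dense(784, 256, relu; initW = init_normal),
    Dense(256, 10; initW = init_normal)
    )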

I used @time in Jupyter, which should also be fine, right?
I also tried measuring with @btime; it came out about 2 seconds less.

Isn't parameter initialization random anyway?

I tried feeding all of the data in, and then the accuracy was about the same as the Python version. So it shouldn't have anything to do with the algorithm.
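For reference, the MXNet example trains on the full 60,000 images and evaluates on the separate 10,000-image test set, while the Julia script above only uses the first 6000 images split 2:1. Loading the full dataset looks roughly like this (the :test argument is assumed to behave like the MNIST loader, so double-check it on your Flux version):

# full training and test sets instead of the first 6000 images
train_imgs = Flux.Data.FashionMNIST.images()
train_labs = Flux.Data.FashionMNIST.labels()
test_imgs  = Flux.Data.FashionMNIST.images(:test)
test_labs  = Flux.Data.FashionMNIST.labels(:test)

# flatten each 28×28 image into a 784-element column, all at once
to_matrix(imgs) = reduce(hcat, [vec(Float32.(img)) for img in imgs])

train_x = to_matrix(train_imgs)
train_y = Flux.onehotbatch(train_labs, 0:9)   # one-hot encode the numeric labels 0:9 directly
test_x  = to_matrix(test_imgs)
test_y  = Flux.onehotbatch(test_labs, 0:9)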