两个for循环怎么才能更快？

zhangchunyong · 2022 年9 月 19 日 06:20

function conver(y,string)
    ls=length(string)
    if ls >= y+2
        s=string[y+1:y+2]
        if 'N' in s
            return missing
        else
            return s=="CG"
        end
    end        
end
function classify_reads(index,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
    covered_cpgs = match_read_cpg[index][:,2]
    if length(covered_cpgs)<4
        return missing
    end
    start_cpgs=starts_cpgs[covered_cpgs]
    start_of_read=starts_reads[index]
    start_cpgs = start_cpgs .- start_of_read
    sequence=seqs_reads[index]
    representation=[]
    for i in range(1,length(start_cpgs))
        c=conver(start_cpgs[i],String(sequence))
        if isequal(c,missing)|isequal(c,nothing)
            deleteat!(overlapcopy,findall(overlapcopy.queryHits.==index .&& overlapcopy.subjectHits.==covered_cpgs[i]))
        else
            push!(representation,c)
        end
    end
    if length(representation)<4
        return missing
    end
    concordant = (all(representation) || all(.!representation))
    return !concordant
end

function calculatestate(classified_reads,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
    p=[]
    for i in classified_reads
        #println(i)
        a=classify_reads(i,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)
        push!(p,a)
    end
    p
end

我的match_read_cpg（函数的第二个参数）是一个groupeddataframe，我需要每次循环其中的一个dataframe去做处理。在执行calculatestate这个函数时，我用了第一个for循环。在其中有一个a=classify_reads(i,match_read_cpg,starts_cpgs,starts_reads,seqs_reads,overlapcopy)函数，这个函数里面又有一个for循环。
我其实在classify_reads这个函数里面，在执行conver时想用广播，但是无奈我在执行后还要去判断，去删除overlapcopy里面的东西。所以就相当于嵌套了两个for。这一步就慢下来了，各位大佬能给看看怎么解决么？另外还有哪个地可以更改下提高性能呢？

AquaIndigo · 2022 年9 月 19 日 07:02

先把Any[]替换成明确类型的数组

xgdgsc · 2022 年9 月 19 日 07:16

deleteat!
执行的次数多吗，多的话最好在循环里收集要删除的index，循环外删除一次

RexWzh · 2022 年9 月 19 日 07:50

第一个函数等价写法

function conver(ind, str)
    ls = length(str)
    ls < ind + 2 && return nothing
    s = SubString(str, ind+1, ind+2)
    if 'N' in s
        return missing
    else
        return s=="CG"
    end
end

这里的几点建议：

string 和库函数重名，虽然是局部变量不影响，但仍建议用其他名称，避免误操作
原代码第三行 if 判断为否后，将返回 nothing，这样一来函数返回值共有三种可能：Nothing, Missing, Bool，返回值不稳定将会影响代码性能
子串用 SubString 索引可以减少内存创建（1.7+ 版本子串可以用 @view 切片）

关于第二个函数：

从可读性角度，建议标上数据类型，比如

function classify_reads(index::Int 
                       , match_read_cpg::xxx
                       , starts_cpgs::xxx
                       , starts_reads::xxx
                       , seqs_reads::xxx
                       , overlapcopy::xxx)
    ...
end

空数组按类型创建，直接使用 [] 创建的是 Any 类型数据，性能降低，比如这里 representation = Vector{Bool}() 或 r =Bool[]