wc
wo@wos-ThinkPad:/mnt/h$ time wc hg38.fa
64186394 64186394 3273481150 hg38.fa
real 0m47.166s
user 0m41.813s
sys 0m2.719s
julia
原始代码
julia 原始测试代码
"""
    lineGC(seq::String)

Return `(gc, nonN)` for one sequence line: `gc` is the number of characters equal to
`G`, `C`, `g` or `c`; `nonN` is the number of characters that are neither `N` nor `n`.
"""
function lineGC(seq::String)
    isgc(ch) = ch in ('G', 'C', 'g', 'c')
    isknown(ch) = !(ch == 'N' || ch == 'n')
    # Two independent passes over `seq`, one per predicate.
    return count(isgc, seq), count(isknown, seq)
end
# Baseline version: open `fs` for reading, skip every line that starts with ">", and sum
# the per-line pairs returned by `lineGC`. Returns `(GCnumber, lineNum)`.
function calGC(fs)
GCnumber=zero(Int)
lineNum=zero(Int)
# NOTE: both accumulators are reassigned inside the `do` closure below, so they are
# captured as `Core.Box` and the return type is inferred as `Tuple{Any,Any}`.
open(fs,"r") do IOstream
for line in eachline(IOstream)
# Lines beginning with ">" are skipped and contribute nothing to either counter.
if startswith(line,">")
continue
else
GC,all=lineGC(line)
GCnumber+=GC
lineNum+=all
end
end
end
# round(GCnumber/lineNum;digits=3)
GCnumber, lineNum
end
# println("GC content: ",calGC(ARGS[1]))
"""
    main()

Run `calGC` on the hard-coded FASTA path and print the GC count, the non-N total and
their ratio on a single line.
"""
function main()
    GCnumber, lineNum = calGC(raw"H:\hg38.fa")
    frac = GCnumber / lineNum
    println("GC=$(GCnumber); total=$(lineNum); frac=", frac)
    return nothing
end
julia> include(raw"C:\Users\woclass\Desktop\GitHub\gc\julia\JuliaCN\4351-jl-perl\4351-ori.jl")
main (generic function with 1 method)
julia> using BenchmarkTools
julia> @btime main()
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
43.173 s (256842214 allocations: 7.67 GiB)
julia> @time main()
GC=1250062479; total=3049315783; frac=0.409948515653618
44.693519 seconds (256.85 M allocations: 7.668 GiB, 0.79% gc time)
内存计数
- function lineGC(seq::String)
- GCnumber=count(x->(x=='G'||x=='C'||x=='g'||x=='c'),seq)
- lineNum=count(x->(x!='N' && x!='n'),seq)
- (GCnumber,lineNum)
- end
-
- function calGC(fs)
- GCnumber=zero(Int)
- lineNum=zero(Int)
- open(fs,"r") do IOstream
48 for line in eachline(IOstream)
0 if startswith(line,">")
- continue
- else
0 GC,all=lineGC(line)
1026971568 GCnumber+=GC
7206074416 lineNum+=all
- end
- end
- end
- # round(GCnumber/lineNum;digits=3)
- GCnumber, lineNum
- end
-
- # println("GC content: ",calGC(ARGS[1]))
- function main()
- path = raw"H:\hg38.fa"
64 GCnumber, lineNum = calGC(path)
496 println("GC=$(GCnumber); total=$(lineNum); frac=", GCnumber/lineNum)
- end
-
- main()
类型稳定测试
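装箱了：原因是 `GCnumber` 和 `lineNum` 在 `open(...) do ... end` 闭包内被重新赋值，被闭包捕获且会被重新赋值的变量会被包装成 `Core.Box`，所以返回类型退化为 `Tuple{Any,Any}`。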
julia> @code_warntype calGC("H:\\")
Variables
#self#::Core.Compiler.Const(calGC, false)
fs::String
#5::var"#5#6"
GCnumber@_4::Core.Box
lineNum@_5::Core.Box
GCnumber@_6::Union{}
lineNum@_7::Union{}
Body::Tuple{Any,Any}
1 ─ (GCnumber@_4 = Core.Box())
│ (lineNum@_5 = Core.Box())
│ %3 = Main.zero(Main.Int)::Core.Compiler.Const(0, false)
│ Core.setfield!(GCnumber@_4, :contents, %3)
│ %5 = Main.zero(Main.Int)::Core.Compiler.Const(0, false)
│ Core.setfield!(lineNum@_5, :contents, %5)
│ (#5 = %new(Main.:(var"#5#6"), GCnumber@_4, lineNum@_5))
│ %8 = #5::var"#5#6"
│ Main.open(%8, fs, "r")
│ %10 = Core.isdefined(GCnumber@_4, :contents)::Bool
└── goto #3 if not %10
2 ─ goto #4
3 ─ Core.NewvarNode(:(GCnumber@_6))
└── GCnumber@_6
4 ┄ %15 = Core.getfield(GCnumber@_4, :contents)::Any
│ %16 = Core.isdefined(lineNum@_5, :contents)::Bool
└── goto #6 if not %16
5 ─ goto #7
6 ─ Core.NewvarNode(:(lineNum@_7))
└── lineNum@_7
7 ┄ %21 = Core.getfield(lineNum@_5, :contents)::Any
│ %22 = Core.tuple(%15, %21)::Tuple{Any,Any}
└── return %22
直接使用 eachline 读文件
修改后代码
"""
    lineGC(seq::String)

Return `(gc, nonN)` for one sequence line: `gc` is the number of characters equal to
`G`, `C`, `g` or `c`; `nonN` is the number of characters that are neither `N` nor `n`.
"""
function lineGC(seq::String)
    isgc(ch) = ch in ('G', 'C', 'g', 'c')
    isknown(ch) = !(ch == 'N' || ch == 'n')
    # Two independent passes over `seq`, one per predicate.
    return count(isgc, seq), count(isknown, seq)
end
"""
    calGC(fs)

Iterate over `eachline(fs)`, skip every line that starts with `>`, and accumulate the
pair returned by `lineGC` for each remaining line. Returns `(GCnumber, lineNum)`.
"""
function calGC(fs)
    GCnumber = zero(Int)
    lineNum = zero(Int)
    for line in eachline(fs)
        # Guard clause: lines starting with ">" contribute nothing.
        startswith(line, ">") && continue
        GC, all = lineGC(line)
        GCnumber += GC
        lineNum += all
    end
    # round(GCnumber/lineNum;digits=3)
    return GCnumber, lineNum
end
# println("GC content: ",calGC(ARGS[1]))
function main()
path = raw"H:\hg38.fa"
GCnumber, lineNum = calGC(path)
println("GC=$(GCnumber); total=$(lineNum); frac=", GCnumber/lineNum)
end
julia> @btime main()
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
38.196 s (128470761 allocations: 5.75 GiB)
julia> @time main()
GC=1250062479; total=3049315783; frac=0.409948515653618
46.266453 seconds (128.47 M allocations: 5.755 GiB, 0.70% gc time)
内存计数
- function lineGC(seq::String)
- GCnumber=count(x->(x=='G'||x=='C'||x=='g'||x=='c'),seq)
- lineNum=count(x->(x!='N' && x!='n'),seq)
- (GCnumber,lineNum)
- end
-
- function calGC(fs)
- GCnumber=zero(Int)
- lineNum=zero(Int)
-
800 for line in eachline(fs)
0 if startswith(line,">")
- continue
- else
0 GC,all=lineGC(line)
0 GCnumber+=GC
6179102752 lineNum+=all
- end
- end
-
- # round(GCnumber/lineNum;digits=3)
0 GCnumber, lineNum
- end
-
- # println("GC content: ",calGC(ARGS[1]))
- function main()
- path = raw"H:\hg38.fa"
0 GCnumber, lineNum = calGC(path)
512 println("GC=$(GCnumber); total=$(lineNum); frac=", GCnumber/lineNum)
- end
-
- main()
类型稳定测试
julia> @code_warntype calGC("H:\\")
Variables
#self#::Core.Compiler.Const(calGC, false)
fs::String
GCnumber::Int64
lineNum::Int64
@_5::Union{Nothing, Tuple{String,Nothing}}
line::String
GC::Int64
@_8::Int64
all::Int64
Body::Tuple{Int64,Int64}
1 ─ (GCnumber = Main.zero(Main.Int))
│ (lineNum = Main.zero(Main.Int))
│ %3 = Main.eachline(fs)::Base.EachLine{IOStream}
│ (@_5 = Base.iterate(%3))
│ %5 = (@_5 === nothing)::Bool
│ %6 = Base.not_int(%5)::Bool
└── goto #7 if not %6
2 ┄ Core.NewvarNode(:(GC))
│ Core.NewvarNode(:(@_8))
│ Core.NewvarNode(:(all))
│ %11 = @_5::Tuple{String,Nothing}::Tuple{String,Nothing}
│ (line = Core.getfield(%11, 1))
│ %13 = Core.getfield(%11, 2)::Core.Compiler.Const(nothing, false)
│ %14 = Main.startswith(line, ">")::Bool
└── goto #4 if not %14
3 ─ goto #5
4 ─ %17 = Main.lineGC(line)::Tuple{Int64,Int64}
│ %18 = Base.indexed_iterate(%17, 1)::Core.Compiler.PartialStruct(Tuple{Int64,Int64}, Any[Int64, Core.Compiler.Const(2, false)])
│ (GC = Core.getfield(%18, 1))
│ (@_8 = Core.getfield(%18, 2))
│ %21 = Base.indexed_iterate(%17, 2, @_8::Core.Compiler.Const(2, false))::Core.Compiler.PartialStruct(Tuple{Int64,Int64}, Any[Int64, Core.Compiler.Const(3, false)])
│ (all = Core.getfield(%21, 1))
│ (GCnumber = GCnumber + GC)
└── (lineNum = lineNum + all)
5 ┄ (@_5 = Base.iterate(%3, %13))
│ %26 = (@_5 === nothing)::Bool
│ %27 = Base.not_int(%26)::Bool
└── goto #7 if not %27
6 ─ goto #2
7 ┄ %30 = Core.tuple(GCnumber, lineNum)::Tuple{Int64,Int64}
└── return %30
改了之后内存分配次数基本下降一半（256.85 M → 128.47 M），但 GC 时间占比本来就很小（0.79% → 0.70%），总耗时没有明显改善。
也就是说瓶颈不在内存分配，问题出在计算部分。
用数组统计
julia 代码
"""
    countAll!(arr::Vector{Int64}, seq::AbstractString)

Increment `arr[Int(c)]` once for every character `c` of `seq`. Mutates `arr` in place
and returns `nothing`.
"""
function countAll!(arr::Vector{Int64}, seq::AbstractString)
    for ch in seq
        # The character's code point is used directly as the 1-based slot index.
        slot = Int(ch)
        arr[slot] = arr[slot] + 1
    end
    return nothing
end
"""
    countChar(fname)

Build a 128-slot `Int64` tally by passing every line of `eachline(fname)` that does not
start with `>` to `countAll!`. Returns the tally vector.
"""
function countChar(fname)
    arr = zeros(Int64, 128)
    for line in eachline(fname)
        # Guard clause: lines starting with ">" are not tallied.
        startswith(line, ">") && continue
        countAll!(arr, line)
    end
    return arr
end
"""
    main()

Tally characters of the hard-coded FASTA path with `countChar`, then print the G+C
count, the A/C/G/T total (both cases) and their ratio on a single line.
"""
function main()
    arr = countChar(raw"H:\hg38.fa")
    # Sum the tally slots of the given characters (slot index = code point).
    tally(chars...) = sum(arr[Int(ch)] for ch in chars)
    G = tally('G', 'g')
    C = tally('C', 'c')
    total = G + C + tally('A', 'a', 'T', 't')
    println("GC=$(G+C); total=$(total); frac=", (G+C)/total)
    return nothing
end
main()
julia> @btime main()
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
GC=1250062479; total=3049315783; frac=0.409948515653618
27.584 s (128470762 allocations: 5.75 GiB)
julia> @time main()
GC=1250062479; total=3049315783; frac=0.409948515653618
29.422292 seconds (128.47 M allocations: 5.755 GiB, 0.84% gc time)
类型稳定性
julia> @code_warntype countChar("H:\\")
Variables
#self#::Core.Compiler.Const(countChar, false)
fname::String
arr::Array{Int64,1}
@_4::Union{Nothing, Tuple{String,Nothing}}
line::String
Body::Array{Int64,1}
1 ─ (arr = Main.zeros(Main.Int64, 128))
│ %2 = Main.eachline(fname)::Base.EachLine{IOStream}
│ (@_4 = Base.iterate(%2))
│ %4 = (@_4 === nothing)::Bool
│ %5 = Base.not_int(%4)::Bool
└── goto #7 if not %5
2 ┄ %7 = @_4::Tuple{String,Nothing}::Tuple{String,Nothing}
│ (line = Core.getfield(%7, 1))
│ %9 = Core.getfield(%7, 2)::Core.Compiler.Const(nothing, false)
│ %10 = Main.startswith(line, ">")::Bool
└── goto #4 if not %10
3 ─ goto #5
4 ─ Main.countAll!(arr, line)
5 ┄ (@_4 = Base.iterate(%2, %9))
│ %15 = (@_4 === nothing)::Bool
│ %16 = Base.not_int(%15)::Bool
└── goto #7 if not %16
6 ─ goto #2
7 ┄ return arr
java
用了一个 if-else 和一个数组统计的方法。
java 代码
靠 baidu 现学，刚刚上手，写得比较挫。
package com.company;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
/**
 * Times two ways of counting bases in a FASTA file: an if-else chain
 * ({@link #countLettersIf}) and a per-character frequency table
 * ({@link #countLettersArray}). Results are kept in static fields and
 * printed by {@link #printRes}.
 */
public class Main {
// Frequency table indexed directly by char value; 128 slots.
// NOTE(review): a char >= 128 in the input would index past the end of this array.
static long[] freqs = new long[128];
static long G = 0;
static long C = 0;
// Declared double so that (G+C) / total in printRes is a floating-point division.
static double total = 0.0;
/**
 * Reads {@code filename} line by line and updates the static counters G, C and total.
 * Scanning of a line stops at the first '>' character. Every other character that is
 * not 'N' or 'n' increments total.
 *
 * @param filename path of the file to read
 * @throws IOException if opening or reading the file fails
 */
public static void countLettersIf(String filename) throws IOException{
try(BufferedReader in = new BufferedReader(new FileReader(filename))){
String line;
while((line = in.readLine()) != null){
for(char ch:line.toCharArray()){
if (ch=='>') {
// Leaves the loop before the total check below, so '>' itself is never counted.
break;
} else if(ch=='G' || ch=='g') {
G++;
} else if(ch=='C' || ch=='c'){
C++;
}
if (ch!='N' && ch!='n') {
total++;
}
}
}
}
}
/**
 * Reads {@code filename} line by line and increments {@code freqs[ch]} for every
 * character, stopping a line at the first '>' character. Counts accumulate in the
 * static table, so repeated calls add to earlier results.
 *
 * @param filename path of the file to read
 * @return the static {@code freqs} array itself (not a copy)
 * @throws IOException if opening or reading the file fails
 */
public static long[] countLettersArray(String filename) throws IOException{
try(BufferedReader in = new BufferedReader(new FileReader(filename))){
String line;
while((line = in.readLine()) != null){
for(char ch:line.toCharArray()){
if (ch=='>') {
break;
}
freqs[ch]++;
}
}
}
return freqs;
}
/** Prints G, C, their sum, total (cast to long) and the ratio (G+C)/total. */
public static void printRes() {
System.out.println("G=" + G + "; C=" + C + "; GC=" + (G+C));
System.out.println("total=" + (long) total);
System.out.println("frac=" + ((G+C) / total));
}
/**
 * Runs both counting styles on the hard-coded path, printing the wall-clock time and
 * the results after each one.
 *
 * @param args unused
 * @throws IOException if either counting pass fails to read the file
 */
public static void main(String[] args) throws IOException{
long starTime = System.currentTimeMillis();
countLettersIf("H:\\hg38.fa");
long endTime = System.currentTimeMillis();
long Time = endTime - starTime;
System.out.println("if-else style Time=" + Time/1000.0 + "s");
printRes();
System.out.println("----------------");
starTime = System.currentTimeMillis();
countLettersArray("H:\\hg38.fa");
endTime = System.currentTimeMillis();
Time = endTime - starTime;
System.out.println("array style Time=" + Time/1000.0 + "s");
// Overwrite the counters from the first pass with values derived from freqs so that
// printRes can be reused. Here total covers only A/C/G/T in both cases, whereas the
// if-else pass counted every character other than 'N'/'n'.
G = freqs['G'] + freqs['g'];
C = freqs['C'] + freqs['c'];
total = G+C+freqs['A'] + freqs['a']+freqs['T'] + freqs['t'];
printRes();
}
}
if-else style Time=42.075s
G=626335137; C=623727342; GC=1250062479
total=3049315783
frac=0.409948515653618
----------------
array style Time=17.056s
G=626335137; C=623727342; GC=1250062479
total=3049315783
frac=0.409948515653618