生信里经常会遇到需要根据某个/些个基因的ID,来查找目标序列的需求,这里自己搞个例子尝试下:
- 基因ID文件:
julia> less("ID.txt")
Gene1
Gene2
- 所有基因的序列文件:
julia> less("AllGene.fa")
>Gene1
ATTGGGGGGGG
ATTTGG
ATGGG
>Gene2
AGGGGGG
AGGGC
TTGT
>Gene3
AAAGGGGGGCCC
TTTGGG
根据之前Julia尝试将基因序列多行的fasta文件转为一行的经验和热心大佬们的建议,写了如下函数,还是希望各位大佬不吝赐教,提前感谢!
"输入:基因ID文件,所有基因ID的Fasta文件。结果:返回目标基因的序列fa文件"
function GeneID2Fa(GeneIDFile::AbstractString, AllFaFile::AbstractString)
path_split = splitpath(abspath(GeneIDFile))
path_split[end] = "$(path_split[end]).fasta"
ResFile = joinpath(path_split...)
io = open(ResFile,"w")
Seq = Dict{AbstractString,AbstractString}()
GeneName = ""
open(AllFaFile) do Fafile
for Fa in eachline(Fafile)
if(startswith(Fa,">"))
GeneName = chomp(replace(Fa,">"=>""))
Seq[GeneName] = ""
else
Seq[GeneName] *= chomp(Fa)
end
end
end
open(GeneIDFile) do IdFile
for id in eachline(IdFile)
if(!isempty(Seq[id]))
write(io,">"*id*"\n")
write(io,Seq[id]*"\n")
end
end
end
close(io)
end
julia> methods(GeneID2Fa)
# 1 method for generic function "GeneID2Fa":
[1] GeneID2Fa(GeneIDFile::AbstractString, AllFaFile::AbstractString; ResFile) in Main at REPL[123]:1
help?> GeneID2Fa
search: GeneID2Fa
输入:基因ID文件,所有基因ID的Fasta文件。结果:返回目标基因的序列fa文件
julia> GeneID2Fa("ID.txt","AllGene.fa")
julia> less("ID.txt.fasta")
>Gene1
ATTGGGGGGGGATTTGGATGGG
>Gene2
AGGGGGGAGGGCTTGT