From 16edd6029080ade3343fe2e88eb92a4668e25314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 17 May 2024 21:58:20 -0500 Subject: [PATCH 01/40] First steps towards ORFs as GenomicIntervals --- Manifest.toml | 52 ++++++++++++++++++- Project.toml | 2 + src/GeneFinder.jl | 1 + src/algorithms/naivefinder.jl | 21 +++++--- src/extended.jl | 18 +++++-- src/findorfs.jl | 1 + src/types.jl | 98 +++++++++++++++++++++++++++++------ 7 files changed, 165 insertions(+), 28 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 030f7d6..65da618 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.3" manifest_format = "2.0" -project_hash = "b881dd48d80d6b19dfa1756a0b2d6e2e42d93e33" +project_hash = "514456eec9673906fe896a3ae36685951a71c19f" [[deps.Automa]] deps = ["PrecompileTools", "TranscodingStreams"] @@ -10,6 +10,9 @@ git-tree-sha1 = "588e0d680ad1d7201d4c6a804dcb1cd9cba79fbb" uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" version = "1.0.3" +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + [[deps.BioGenerics]] deps = ["TranscodingStreams"] git-tree-sha1 = "7bbc085aebc6faa615740b63756e4986c9e85a70" @@ -42,6 +45,25 @@ git-tree-sha1 = "e32a61f028b823a172c75e26865637249bb30dff" uuid = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" version = "5.1.3" +[[deps.Compat]] +deps = ["TOML", "UUIDs"] +git-tree-sha1 = "b1c55339b7c6c350ee89f2c1604299660525b248" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "4.15.0" + + [deps.Compat.extensions] + CompatLinearAlgebraExt = "LinearAlgebra" + + [deps.Compat.weakdeps] + Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" + LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[deps.DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82" +uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.20" + [[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" @@ -56,11 +78,35 @@ weakdeps = ["BioSequences"] [deps.FASTX.extensions] BioSequencesExt = "BioSequences" +[[deps.GenomicFeatures]] +deps = ["BioGenerics", "DataStructures", "IntervalTrees"] +git-tree-sha1 = "720844f71f118dd2ab4898a89b2b4feca7730d81" +uuid = "899a7d2d-5c61-547b-bef9-6698a8d05446" +version = "3.0.0" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.IntervalTrees]] +git-tree-sha1 = "dc3b97bb5c9cb7c437f74027309f2c2f09a82aaf" +uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b" +version = "1.1.0" + [[deps.IterTools]] git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023" uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" version = "1.10.0" +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.OrderedCollections]] +git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.6.3" + [[deps.PrecompileTools]] deps = ["Preferences"] git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" @@ -117,6 +163,10 @@ git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a" uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795" version = "1.1.2" +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + [[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/Project.toml b/Project.toml index fecb45f..cae9ace 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.4.0" BioMarkovChains = "f861b655-cb5f-42ce-b66a-341b542d4f2c" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" +GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" @@ -14,6 +15,7 @@ PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" BioMarkovChains = "0.9" BioSequences = "3" FASTX = "2" +GenomicFeatures = "3" IterTools = "1.4" PrecompileTools = "1" julia = "1" diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 7a8c771..b52e45d 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -25,6 +25,7 @@ using BioMarkovChains: BioMarkovChain, dnaseqprobability, ECOLICDS, ECOLINOCDS, using FASTX: FASTAReader, sequence using IterTools: takewhile, iterated using PrecompileTools: @setup_workload, @compile_workload +using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, leftposition, rightposition, strand, metadata, STRAND_POS include("algorithms/naivefinder.jl") include("types.jl") diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index c3dd114..94f7af1 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -1,4 +1,6 @@ -export naivefinder, naivefinderscored, log_odds_ratio_decision_rule, lordr +export naivefinder, naivefinderscored, log_odds_ratio_decision_rule, lordr, lors + +const lors = log_odds_ratio_score """ locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool=false) where {N} @@ -48,11 +50,13 @@ function naivefinder( sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool = false, min_len::Int64 = 6, + scheme::Union{Nothing, Function} = nothing, kwargs... ) where {N} seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) orfs = Vector{ORF}() + id = 0 for strand in ('+', '-') seq = strand == '-' ? reverse_complement(sequence) : sequence @@ -61,7 +65,8 @@ function naivefinder( frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] start = strand == '+' ? location.start : seqlen - location.stop + 1 stop = start + length(location) - 1 - push!(orfs, ORF(start:stop, strand, frame, 0.0)) + id += 1 + push!(orfs, ORF("$(id)", start:stop, strand, frame, NaiveFinder(), scheme)) end end end @@ -91,12 +96,13 @@ function naivefinderscored( sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool = false, min_len::Int64 = 6, - scoringscheme::BioMarkovChain = ECOLICDS, # ECOLINOCDS + model::BioMarkovChain = ECOLICDS, kwargs... ) where {N} seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) orfs = Vector{ORF}() + id = 0 for strand in ('+', '-') seq = strand == '-' ? reverse_complement(sequence) : sequence @@ -106,8 +112,9 @@ function naivefinderscored( start = strand == '+' ? location.start : seqlen - location.stop + 1 stop = start + length(location) - 1 # score = -10log10(dnaseqprobability(seq[start:stop], scoringscheme)) - score = log_odds_ratio_score(seq[start:stop], scoringscheme) - push!(orfs, ORF(start:stop, strand, frame, score)) + score = log_odds_ratio_score(seq[start:stop], model) + id += 1 + push!(orfs, ORF("$(id)", start:stop, strand, frame, NaiveFinderScored(), lors, score)) end end end @@ -154,7 +161,7 @@ sequence = dna"ATGGCATCTAG" iscoding(sequence) # Returns: true or false ``` """ -function log_odds_ratio_decision_rule( #log_odds_ratio_decision lordr/cudr/kfdr/aadr +function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr sequence::NucleicSeqOrView{DNAAlphabet{N}}; codingmodel::BioMarkovChain = ECOLICDS, noncodingmodel::BioMarkovChain = ECOLINOCDS, @@ -176,7 +183,7 @@ function log_odds_ratio_decision_rule( #log_odds_ratio_decision lordr/cudr/kfdr/ end end -const lordr = log_odds_ratio_decision_rule # criteria +const log_odds_ratio_decision_rule = lordr # criteria ## Another alternative: diff --git a/src/extended.jl b/src/extended.jl index d622866..4d08a56 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -2,7 +2,9 @@ import Base: isless, iterate, sort, getindex -Base.isless(a::ORF, b::ORF) = isless(a.location, b.location) +# Base.isless(a::ORF, b::ORF) = isless(a.location, b.location) +Base.isless(a::ORF, b::ORF) = isless(a.first:a.last, b.first:b.last) +# Base.isless(a::ORF, b::ORF) = isless(a.location, b.location) # Base.isless(a::ORF, b::ORF) = isless(a.score, b.score) # Base.sort(v::Vector{<:ORF}; kwargs...) = sort(v, by = _orf_sort_key) @@ -12,11 +14,19 @@ Base.isless(a::ORF, b::ORF) = isless(a.location, b.location) #TODOs: how to make more robust the getindex method? confroning the frames? # Base.getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} = orf.strand == '+' ? (@view sequence[orf.location]) : reverse_complement(@view sequence[orf.location]) +# function getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} +# if orf.strand == '+' +# return @view sequence[orf.location] +# else +# return reverse_complement(@view sequence[orf.location]) +# end +# end + function getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} - if orf.strand == '+' - return @view sequence[orf.location] + if orf.strand == '+' || orf.strand == STRAND_POS + return @view sequence[orf.first:orf.last] else - return reverse_complement(@view sequence[orf.location]) + return reverse_complement(@view sequence[orf.first:orf.last]) end end diff --git a/src/findorfs.jl b/src/findorfs.jl index 34a6a38..76d8f38 100644 --- a/src/findorfs.jl +++ b/src/findorfs.jl @@ -54,6 +54,7 @@ function findorfs( end + ### Some possible ideas: # function findorfs( diff --git a/src/types.jl b/src/types.jl index 1f4e5d7..73d6613 100644 --- a/src/types.jl +++ b/src/types.jl @@ -50,23 +50,23 @@ The ORF struct represents an open reading frame in a DNA sequence. It has two fi - `frame`: is an Int type indicating the reading frame of the ORF. The frame is the position of the first nucleotide of the codon that starts the ORF, relative to the start of the sequence. It can be 1, 2, or 3. - `score`: is a Union{Nothing, Float64} type indicating the score of the ORF. It can be a Float64 or nothing. """ -struct ORF <: AbstractGene # could also be mutable so that scores can be updated later, but tests fails - #TODOs: location might be a complex Union allowing UnitRange or a Join of ranges (e.g. 1..100, 200..300)? - location::UnitRange{Int64} # Note that it is also called position for gene struct in GenomicAnotations - strand::Char - frame::Int # Use Int64 instead of Int - score::Union{Nothing, Float64} # Add score field shold be a namedtuple with the score and the method used to score - # rbs::Union{Nothing, Vector{UnitRange{Int64}}} # Add RBS field see https://github.com/deprekate/PHANOTATE/blob/c77e80caef8dc3264f7dc698b087bcd486216bcb/phanotate_modules/functions.py#L48 - - function ORF(location::UnitRange{Int64}, strand::Char, frame::Int, score::Union{Nothing, Float64} = nothing) - @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." - @assert strand in ('+', '-') "Invalid strand value. Strand must be '+' or '-'." - @assert location[1] < location[end] "Invalid location. Start must be less than stop." - @assert score isa Union{Nothing, Float64} "Invalid score value. Score must be a Float64 or nothing." +# struct ORF <: AbstractGene # could also be mutable so that scores can be updated later, but tests fails +# #TODOs: location might be a complex Union allowing UnitRange or a Join of ranges (e.g. 1..100, 200..300)? +# location::UnitRange{Int64} # Note that it is also called position for gene struct in GenomicAnotations +# strand::Char +# frame::Int # Use Int64 instead of Int +# score::Union{Nothing, Float64} # Add score field shold be a namedtuple with the score and the method used to score +# # rbs::Union{Nothing, Vector{UnitRange{Int64}}} # Add RBS field see https://github.com/deprekate/PHANOTATE/blob/c77e80caef8dc3264f7dc698b087bcd486216bcb/phanotate_modules/functions.py#L48 + +# function ORF(location::UnitRange{Int64}, strand::Char, frame::Int, score::Union{Nothing, Float64} = nothing) +# @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." +# @assert strand in ('+', '-') "Invalid strand value. Strand must be '+' or '-'." +# @assert location[1] < location[end] "Invalid location. Start must be less than stop." +# @assert score isa Union{Nothing, Float64} "Invalid score value. Score must be a Float64 or nothing." - return new(location, strand, frame, score) - end -end +# return new(location, strand, frame, score) +# end +# end ## Interfaces for gene finders @@ -80,3 +80,69 @@ struct NaiveFinderScored <: GeneFinderMethod end abstract type GeneScoringScheme end struct NaiveScoringScheme <: GeneScoringScheme end + +export ORF + +struct ORF{F} <: GenomicFeatures.AbstractGenomicInterval{F} + groupname::String + first::Int64 + last::Int64 + strand::Strand + frame::Int + finder::F + scheme::Union{Nothing, Function} + score::Union{Nothing, Float64} + rbs::Union{Nothing, Vector{UnitRange{Int64}}} # Add RBS field +end + +function ORF( + groupname::String, + first::Int64, + last::Int64, + strand::Union{Strand,Char}, + frame::Int, + finder::F, + scheme::Union{Nothing, Function}=nothing, + score::Union{Nothing, Float64}=nothing, + rbs::Union{Nothing, Vector{UnitRange{Int64}}}=nothing +) where {F <: GeneFinderMethod} + return ORF{F}(groupname, first, last, strand, frame, finder, scheme, score, rbs) +end + +function ORF( + id::String, + location::UnitRange{Int64}, + strand::Union{Strand,Char}, + frame::Int, finder::F, + scheme::Union{Nothing, Function}=nothing, + score::Union{Nothing, Float64}=nothing, + rbs::Union{Nothing, Vector{UnitRange{Int64}}}=nothing +) where {F <: GeneFinderMethod} + return ORF{F}(id, first(location), last(location), strand, frame, finder, scheme, score, rbs) +end + +function frame(i::ORF) + return i.frame +end + +function scheme(i::ORF) + return i.scheme +end + +function score(i::ORF) + return i.score +end + +function finder(i::ORF) + return i.finder +end + +function Base.show(io::IO, i::ORF) + if get(io, :compact, false) + print(io, groupname(i), ":", leftposition(i), "-", rightposition(i)) + else + # print(io, "ORF{$(typeof(finder(i))),$(scheme(i))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(round(score(i), digits=5)))") + print(io, "ORF{$(typeof(finder(i)))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(score(i)))") + # println(io, " ", groupname(i), " ", leftposition(i), ":", rightposition(i), ", ", strand(i), ", ", frame(i), ", score: $(round(score(i), digits=2))") + end +end \ No newline at end of file From d9cc396cf44bbab0a8c661f526eb5bd7c6ae313e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 17 May 2024 23:38:02 -0500 Subject: [PATCH 02/40] Some io fixes, and tests --- src/algorithms/naivefinder.jl | 4 +- src/io.jl | 164 +++++++++++++++++----------------- src/iscoding.jl | 4 +- src/types.jl | 3 +- test/findorfstest.jl | 6 +- 5 files changed, 91 insertions(+), 90 deletions(-) diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 94f7af1..d013e2b 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -66,7 +66,7 @@ function naivefinder( start = strand == '+' ? location.start : seqlen - location.stop + 1 stop = start + length(location) - 1 id += 1 - push!(orfs, ORF("$(id)", start:stop, strand, frame, NaiveFinder(), scheme)) + push!(orfs, ORF("ORF$(id)", start:stop, strand, frame, NaiveFinder(), scheme)) end end end @@ -114,7 +114,7 @@ function naivefinderscored( # score = -10log10(dnaseqprobability(seq[start:stop], scoringscheme)) score = log_odds_ratio_score(seq[start:stop], model) id += 1 - push!(orfs, ORF("$(id)", start:stop, strand, frame, NaiveFinderScored(), lors, score)) + push!(orfs, ORF("ORF$(id)", start:stop, strand, frame, NaiveFinderScored(), lors, score)) end end end diff --git a/src/io.jl b/src/io.jl index 131a710..9e0a88f 100644 --- a/src/io.jl +++ b/src/io.jl @@ -18,7 +18,7 @@ Write BED data to a file. - `min_len::Int64=6`: The minimum length that a CDS must have in order to be included in the output file. Default is `6`. """ function write_orfs_bed( - input::NucleicSeqOrView{DNAAlphabet{N}}, + input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, output::Union{IOStream, IOBuffer}, method::M; alternative_start::Bool = false, @@ -26,12 +26,12 @@ function write_orfs_bed( ) where {N , M<:GeneFinderMethod} orfs = findorfs(input, method; alternative_start, min_len) @inbounds for orf in orfs - println(output, orf.location.start, "\t", orf.location.stop, "\t", orf.strand, "\t", orf.frame) + println(output, orf.first, "\t", orf.last, "\t", orf.strand, "\t", orf.frame) end end function write_orfs_bed( - input::NucleicSeqOrView{DNAAlphabet{N}}, + input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, output::String, method::M; alternative_start::Bool = false, @@ -40,40 +40,40 @@ function write_orfs_bed( orfs = findorfs(input, method; alternative_start, min_len) open(output, "w") do f @inbounds for orf in orfs - write(f, "$(orf.location.start)\t$(orf.location.stop)\t$(orf.strand)\t$(orf.frame)\n") + write(f, "$(orf.first)\t$(orf.last)\t$(orf.strand)\t$(orf.frame)\n") end end end -function write_orfs_bed( - input::String, - output::Union{IOStream, IOBuffer}, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] # rewrite input to be a DNA sequence - orfs = findorfs(input, method; alternative_start, min_len) - @inbounds for orf in orfs - println(output, orf.location.start, "\t", orf.location.stop, "\t", orf.strand, "\t", orf.frame) - end -end - -function write_orfs_bed( - input::String, - output::String, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) - open(output, "w") do f - @inbounds for orf in orfs - write(f, "$(orf.location.start)\t$(orf.location.stop)\t$(orf.strand)\t$(orf.frame)\n") - end - end -end +# function write_orfs_bed( +# input::String, +# output::Union{IOStream, IOBuffer}, +# method::M; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {M<:GeneFinderMethod} +# input = fasta_to_dna(input)[1] # rewrite input to be a DNA sequence +# orfs = findorfs(input, method; alternative_start, min_len) +# @inbounds for orf in orfs +# println(output, orf.first, "\t", orf.last, "\t", orf.strand, "\t", orf.frame) +# end +# end + +# function write_orfs_bed( +# input::String, +# output::String, +# method::M; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {M<:GeneFinderMethod} +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input, method; alternative_start, min_len) +# open(output, "w") do f +# @inbounds for orf in orfs +# write(f, "$(orf.first)\t$(orf.last)\t$(orf.strand)\t$(orf.frame)\n") +# end +# end +# end """ write_orfs_fna(input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}, method::M; kwargs...) where {N, M<:GeneFinderMethod} @@ -102,12 +102,12 @@ filename = "output.fna" seq = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" open(filename, "w") do file - write_orfs_fna(seq, file) + write_orfs_fna(seq, file, NaiveFinder()) end ``` """ function write_orfs_fna( - input::NucleicSeqOrView{DNAAlphabet{N}}, + input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, output::Union{IOStream, IOBuffer}, method::M; alternative_start::Bool = false, @@ -118,13 +118,13 @@ function write_orfs_fna( padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) + println(output, ">", orf.groupname, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame, " score=", orf.score) println(output, input[orf]) end end function write_orfs_fna( - input::NucleicSeqOrView{DNAAlphabet{N}}, + input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, output::String, method::M; alternative_start::Bool = false, @@ -136,47 +136,47 @@ function write_orfs_fna( open(output, "w") do f for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) + write(f, ">$(orf.groupname) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame) score=$(orf.score)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) end end end -function write_orfs_fna( - input::String, - output::Union{IOStream, IOBuffer}, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - @inbounds for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) - println(output, input[orf]) - end -end - -function write_orfs_fna( - input::String, - output::String, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - open(output, "w") do f - for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) - end - end -end +# function write_orfs_fna( +# input::String, +# output::Union{IOStream, IOBuffer}, +# method::M; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {M<:GeneFinderMethod} +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input, method; alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# @inbounds for (i, orf) in enumerate(orfs) +# id = string(lpad(string(i), padding, "0")) +# println(output, ">ORF", id, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame) +# println(output, input[orf]) +# end +# end + +# function write_orfs_fna( +# input::String, +# output::String, +# method::M; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {M<:GeneFinderMethod} +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input, method; alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# open(output, "w") do f +# for (i, orf) in enumerate(orfs) +# id = string(lpad(string(i), padding, "0")) +# write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) +# end +# end +# end """ write_orfs_faa(input::NucleicSeqOrView{DNAAlphabet{4}}, output::Union{IOStream, IOBuffer}, method::M; kwargs...) where {N, M<:GeneFinderMethod} @@ -222,7 +222,7 @@ function write_orfs_faa( padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) + println(output, ">ORF", id, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame) println(output, translate(input[orf]; code)) end end @@ -241,7 +241,7 @@ function write_orfs_faa( open(output, "w") do f for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") + write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") end end end @@ -260,7 +260,7 @@ function write_orfs_faa( padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) + println(output, ">ORF", id, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame) println(output, translate(input[orf]; code)) end end @@ -280,7 +280,7 @@ function write_orfs_faa( open(output, "w") do f for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") + write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") end end end @@ -318,7 +318,7 @@ function write_orfs_gff( println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") for (i, orf) in enumerate(orfs) id = string("ORF", lpad(string(i), padding, "0")) - println(output, "Chr\t.\tORF\t", orf.location.start, "\t", orf.location.stop, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) + println(output, "Chr\t.\tORF\t", orf.first, "\t", orf.last, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) end end @@ -338,7 +338,7 @@ function write_orfs_gff( id = string("ORF", lpad(string(i), padding, "0")) write( f, - "Chr\t.\tORF\t$(orf.location.start)\t$(orf.location.stop)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", + "Chr\t.\tORF\t$(orf.first)\t$(orf.last)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", ) end end @@ -358,7 +358,7 @@ function write_orfs_gff( println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") for (i, orf) in enumerate(orfs) id = string("ORF", lpad(string(i), padding, "0")) - println(output, "Chr\t.\tORF\t", orf.location.start, "\t", orf.location.stop, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) + println(output, "Chr\t.\tORF\t", orf.first, "\t", orf.last, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) end end @@ -379,7 +379,7 @@ function write_orfs_gff( id = string("ORF", lpad(string(i), padding, "0")) write( f, - "Chr\t.\tORF\t$(orf.location.start)\t$(orf.location.stop)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", + "Chr\t.\tORF\t$(orf.first)\t$(orf.last)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", ) end end diff --git a/src/iscoding.jl b/src/iscoding.jl index 61f189c..a0d31c8 100644 --- a/src/iscoding.jl +++ b/src/iscoding.jl @@ -29,10 +29,10 @@ end function iscoding( sequence::NucleicSeqOrView{DNAAlphabet{N}}, - orf::ORF; + orf::ORF{M}; criteria::Function = lordr, kwargs... -) where {N} +) where {N,M <: GeneFinderMethod} return criteria(sequence[orf]; kwargs...) end diff --git a/src/types.jl b/src/types.jl index 73d6613..cbd8eaf 100644 --- a/src/types.jl +++ b/src/types.jl @@ -139,7 +139,8 @@ end function Base.show(io::IO, i::ORF) if get(io, :compact, false) - print(io, groupname(i), ":", leftposition(i), "-", rightposition(i)) + # print(io, groupname(i), ":", leftposition(i), "-", rightposition(i)) + print(io, "ORF{$(typeof(finder(i)))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(score(i)))") else # print(io, "ORF{$(typeof(finder(i))),$(scheme(i))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(round(score(i), digits=5)))") print(io, "ORF{$(typeof(finder(i)))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(score(i)))") diff --git a/test/findorfstest.jl b/test/findorfstest.jl index 3ce7b06..ae7f731 100644 --- a/test/findorfstest.jl +++ b/test/findorfstest.jl @@ -5,7 +5,7 @@ seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" orfs01 = findorfs(seq01, NaiveFinder()) - @test orfs01 == [ORF(1:33, '+', 1, 0.0), ORF(4:33, '+', 1, 0.0), ORF(8:22, '+', 2, 0.0), ORF(12:29, '+', 3, 0.0), ORF(16:33, '+', 1, 0.0)] + @test orfs01 == [ORF{NaiveFinder}(ORF1, 1:33, +, 1, nothing), ORF{NaiveFinder}(ORF2, 4:33, +, 1, nothing), ORF{NaiveFinder}(ORF3, 8:22, +, 2, nothing), ORF{NaiveFinder}(ORF4, 12:29, +, 3, nothing), ORF{NaiveFinder}(ORF5, 16:33, +, 1, nothing)] @test length(orfs01) == 5 # > 180195.SAMN03785337.LFLS01000089 -> finds only 1 gene in Prodigal (from Pyrodigal tests) @@ -13,7 +13,7 @@ orfs02 = findorfs(seq02, NaiveFinder()) @test length(orfs02) == 12 - @test orfs02 == [ORF(29:40, '+', 2, 0.0), ORF(137:145, '+', 2, 0.0), ORF(164:184, '+', 2, 0.0), ORF(173:184, '+', 2, 0.0), ORF(236:241, '+', 2, 0.0), ORF(248:268, '+', 2, 0.0), ORF(362:373, '+', 2, 0.0), ORF(470:496, '+', 2, 0.0), ORF(551:574, '+', 2, 0.0), ORF(569:574, '+', 2, 0.0), ORF(581:601, '+', 2, 0.0), ORF(695:706, '+', 2, 0.0)] + @test orfs02 == [ORF{NaiveFinder}(ORF1, 29:40, +, 2, nothing), ORF{NaiveFinder}(ORF2, 137:145, +, 2, nothing), ORF{NaiveFinder}(ORF3, 164:184, +, 2, nothing), ORF{NaiveFinder}(ORF4, 173:184, +, 2, nothing), ORF{NaiveFinder}(ORF5, 236:241, +, 2, nothing), ORF{NaiveFinder}(ORF6, 248:268, +, 2, nothing), ORF{NaiveFinder}(ORF7, 362:373, +, 2, nothing), ORF{NaiveFinder}(ORF8, 470:496, +, 2, nothing), ORF{NaiveFinder}(ORF9, 551:574, +, 2, nothing), ORF{NaiveFinder}(ORF10, 569:574, +, 2, nothing), ORF{NaiveFinder}(ORF11, 581:601, +, 2, nothing), ORF{NaiveFinder}(ORF12, 695:706, +, 2, nothing)] # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 # Pyrodigal predicts 2 genes from this sequence: @@ -23,7 +23,7 @@ seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" orfs03 = findorfs(seq03, NaiveFinder(), min_len=75) @test length(orfs03) == 9 - @test orfs03 == [ORF(37:156, '+', 1, 0.0), ORF(194:268, '-', 2, 0.0), ORF(194:283, '-', 2, 0.0), ORF(249:347, '+', 3, 0.0), ORF(426:590, '+', 3, 0.0), ORF(565:657, '+', 1, 0.0), ORF(650:727, '-', 2, 0.0), ORF(786:872, '+', 3, 0.0), ORF(887:976, '-', 2, 0.0)] + @test orfs03 == [ORF{NaiveFinder}(ORF1, 37:156, +, 1, nothing), ORF{NaiveFinder}(ORF9, 194:268, -, 2, nothing), ORF{NaiveFinder}(ORF8, 194:283, -, 2, nothing), ORF{NaiveFinder}(ORF2, 249:347, +, 3, nothing), ORF{NaiveFinder}(ORF3, 426:590, +, 3, nothing), ORF{NaiveFinder}(ORF4, 565:657, +, 1, nothing), ORF{NaiveFinder}(ORF7, 650:727, -, 2, nothing), ORF{NaiveFinder}(ORF5, 786:872, +, 3, nothing), ORF{NaiveFinder}(ORF6, 887:976, -, 2, nothing)] #|-> This occured in Pyrodigal # Lambda phage tests # Compare to https://github.com/jonas-fuchs/viral_orf_finder/blob/master/orf_finder.py From 87b49c3931867965496f2f2f51bdedb78946e439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Tue, 21 May 2024 17:46:00 -0500 Subject: [PATCH 03/40] Some attempts to correct tests --- src/algorithms/naivefinder.jl | 4 +- src/types.jl | 9 ++- src/utils.jl | 11 +++- test/Project.toml | 2 + test/findorfstest.jl | 102 ++++++++++++++++++---------------- test/getindextest.jl | 9 ++- test/iotest.jl | 68 +++++++++++------------ test/runtests.jl | 26 ++++++++- 8 files changed, 137 insertions(+), 94 deletions(-) diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index d013e2b..4351ed7 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -66,7 +66,7 @@ function naivefinder( start = strand == '+' ? location.start : seqlen - location.stop + 1 stop = start + length(location) - 1 id += 1 - push!(orfs, ORF("ORF$(id)", start:stop, strand, frame, NaiveFinder(), scheme)) + push!(orfs, ORF(get_variable_name(sequence), start:stop, strand, frame, NaiveFinder(), scheme)) end end end @@ -114,7 +114,7 @@ function naivefinderscored( # score = -10log10(dnaseqprobability(seq[start:stop], scoringscheme)) score = log_odds_ratio_score(seq[start:stop], model) id += 1 - push!(orfs, ORF("ORF$(id)", start:stop, strand, frame, NaiveFinderScored(), lors, score)) + push!(orfs, ORF(get_variable_name(sequence), start:stop, strand, frame, NaiveFinderScored(), lors, score)) end end end diff --git a/src/types.jl b/src/types.jl index cbd8eaf..68df0f9 100644 --- a/src/types.jl +++ b/src/types.jl @@ -1,6 +1,7 @@ export ORF export NaiveFinder, NaiveFinderScored export NaiveScoringScheme +export finder, frame, scheme, score # Structs associated with gene models abstract type AbstractGene end @@ -81,7 +82,7 @@ abstract type GeneScoringScheme end struct NaiveScoringScheme <: GeneScoringScheme end -export ORF +# export ORF struct ORF{F} <: GenomicFeatures.AbstractGenomicInterval{F} groupname::String @@ -106,6 +107,7 @@ function ORF( score::Union{Nothing, Float64}=nothing, rbs::Union{Nothing, Vector{UnitRange{Int64}}}=nothing ) where {F <: GeneFinderMethod} + # @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." return ORF{F}(groupname, first, last, strand, frame, finder, scheme, score, rbs) end @@ -118,6 +120,7 @@ function ORF( score::Union{Nothing, Float64}=nothing, rbs::Union{Nothing, Vector{UnitRange{Int64}}}=nothing ) where {F <: GeneFinderMethod} + # @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." return ORF{F}(id, first(location), last(location), strand, frame, finder, scheme, score, rbs) end @@ -140,10 +143,10 @@ end function Base.show(io::IO, i::ORF) if get(io, :compact, false) # print(io, groupname(i), ":", leftposition(i), "-", rightposition(i)) - print(io, "ORF{$(typeof(finder(i)))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(score(i)))") + print(io, "ORF{$(typeof(finder(i)))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)), $(score(i)))") else # print(io, "ORF{$(typeof(finder(i))),$(scheme(i))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(round(score(i), digits=5)))") - print(io, "ORF{$(typeof(finder(i)))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(score(i)))") + print(io, "ORF{$(typeof(finder(i)))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)), $(score(i)))") # println(io, " ", groupname(i), " ", leftposition(i), ":", rightposition(i), ", ", strand(i), ", ", frame(i), ", score: $(round(score(i), digits=2))") end end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index 5214223..404bf9f 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,4 @@ -export hasprematurestop, fasta_to_dna +export hasprematurestop, fasta_to_dna, get_variable_name # General purposes methods supporting main functions """ @@ -35,4 +35,13 @@ function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4} FASTAReader(open(input)) do reader return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] end +end + +function get_variable_name(var) + for name in names(Main) + if getfield(Main, name) === var + return string(name) + end + end + return nothing end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index 60aa073..baa2a68 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,3 +3,5 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" +TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe" diff --git a/test/findorfstest.jl b/test/findorfstest.jl index ae7f731..7c9f1cf 100644 --- a/test/findorfstest.jl +++ b/test/findorfstest.jl @@ -1,57 +1,61 @@ -@testset "findorfs" begin - # cd(@__DIR__) - - # A random seq to start - seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - orfs01 = findorfs(seq01, NaiveFinder()) - - @test orfs01 == [ORF{NaiveFinder}(ORF1, 1:33, +, 1, nothing), ORF{NaiveFinder}(ORF2, 4:33, +, 1, nothing), ORF{NaiveFinder}(ORF3, 8:22, +, 2, nothing), ORF{NaiveFinder}(ORF4, 12:29, +, 3, nothing), ORF{NaiveFinder}(ORF5, 16:33, +, 1, nothing)] - @test length(orfs01) == 5 - - # > 180195.SAMN03785337.LFLS01000089 -> finds only 1 gene in Prodigal (from Pyrodigal tests) - seq02 = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCAATCTGACTGTGGGCGGTGTTACCAACGGCACTGCTACTACTGGCAACATCGCACTGACCGGTAACAATGCGCTGAGCGGTCCGGTCAATCTGAATGCGTCGAATGGCACGGTGACCTTGAACACGACCGGCAATACCACGCTCGGTAACGTGACGGCACAAGGCAATGTGACGACCAATGTGTCCAACGGCAGTCTGACGGTTACCGGCAATACGACAGGTGCCAACACCAACCTCAGTGCCAGCGGCAACCTGACCGTGGGTAACCAGGGCAATATCAGTACCGCAGGCAATGCAACCCTGACGGCCGGCGACAACCTGACGAGCACTGGCAATCTGACTGTGGGCGGCGTCACCAACGGCACGGCCACCACCGGCAACATCGCGCTGACCGGTAACAATGCACTGGCTGGTCCTGTCAATCTGAACGCGCCGAACGGCACCGTGACCCTGAACACAACCGGCAATACCACGCTGGGTAATGTCACCGCACAAGGCAATGTGACGACTAATGTGTCCAACGGCAGCCTGACAGTCGCTGGCAATACCACAGGTGCCAACACCAACCTGAGTGCCAGCGGCAATCTGACCGTGGGCAACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAGC" - orfs02 = findorfs(seq02, NaiveFinder()) - - @test length(orfs02) == 12 - @test orfs02 == [ORF{NaiveFinder}(ORF1, 29:40, +, 2, nothing), ORF{NaiveFinder}(ORF2, 137:145, +, 2, nothing), ORF{NaiveFinder}(ORF3, 164:184, +, 2, nothing), ORF{NaiveFinder}(ORF4, 173:184, +, 2, nothing), ORF{NaiveFinder}(ORF5, 236:241, +, 2, nothing), ORF{NaiveFinder}(ORF6, 248:268, +, 2, nothing), ORF{NaiveFinder}(ORF7, 362:373, +, 2, nothing), ORF{NaiveFinder}(ORF8, 470:496, +, 2, nothing), ORF{NaiveFinder}(ORF9, 551:574, +, 2, nothing), ORF{NaiveFinder}(ORF10, 569:574, +, 2, nothing), ORF{NaiveFinder}(ORF11, 581:601, +, 2, nothing), ORF{NaiveFinder}(ORF12, 695:706, +, 2, nothing)] - - # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 - # Pyrodigal predicts 2 genes from this sequence: - # 1) An alternative start codon (GTG) sequence at 48:347 - # 2) A common start codon sequence at 426:590 - # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop - seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" - orfs03 = findorfs(seq03, NaiveFinder(), min_len=75) - @test length(orfs03) == 9 - @test orfs03 == [ORF{NaiveFinder}(ORF1, 37:156, +, 1, nothing), ORF{NaiveFinder}(ORF9, 194:268, -, 2, nothing), ORF{NaiveFinder}(ORF8, 194:283, -, 2, nothing), ORF{NaiveFinder}(ORF2, 249:347, +, 3, nothing), ORF{NaiveFinder}(ORF3, 426:590, +, 3, nothing), ORF{NaiveFinder}(ORF4, 565:657, +, 1, nothing), ORF{NaiveFinder}(ORF7, 650:727, -, 2, nothing), ORF{NaiveFinder}(ORF5, 786:872, +, 3, nothing), ORF{NaiveFinder}(ORF6, 887:976, -, 2, nothing)] - #|-> This occured in Pyrodigal - # Lambda phage tests - # Compare to https://github.com/jonas-fuchs/viral_orf_finder/blob/master/orf_finder.py - # Salisbury and Tsorukas (2019) paper used the Lambda phage genome with 73 CDS and 545 non-CDS ORFs (a total of 618) to compare predictions between several Gene Finder programs - # For a minimal length of 75 nt the following ORFs are predicted: - # orf_finder.py --> 885 (222 complete) - # findorfs (GeneFinder.jl) --> 885 - # NCBI ORFfinder --> 375 ORFs - # orfipy --> 375 (`orfipy NC_001416.1.fasta --start ATG --include-stop --min 75`) - NC_001416 = fasta_to_dna("data/NC_001416.1.fasta")[1] - NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), min_len=75) - @test length(NC_001416_orfs) == 885 -end - -@testset "getorfs" begin +# @testset "findorfs" begin +# # cd(@__DIR__) + +# # A random seq to start +# seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" +# orfs01 = findorfs(seq01, NaiveFinder()) + +# @test orfs01 == [ORF{NaiveFinder}(1:33, '+', 1, nothing), ORF{NaiveFinder}(4:33, '+', 1, nothing), ORF{NaiveFinder}(8:22, '+', 2, nothing), ORF{NaiveFinder}(12:29, '+', 3, nothing), ORF{NaiveFinder}(16:33, '+', 1, nothing)] +# @test length(orfs01) == 5 + +# # > 180195.SAMN03785337.LFLS01000089 -> finds only 1 gene in Prodigal (from Pyrodigal tests) +# seq02 = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCAATCTGACTGTGGGCGGTGTTACCAACGGCACTGCTACTACTGGCAACATCGCACTGACCGGTAACAATGCGCTGAGCGGTCCGGTCAATCTGAATGCGTCGAATGGCACGGTGACCTTGAACACGACCGGCAATACCACGCTCGGTAACGTGACGGCACAAGGCAATGTGACGACCAATGTGTCCAACGGCAGTCTGACGGTTACCGGCAATACGACAGGTGCCAACACCAACCTCAGTGCCAGCGGCAACCTGACCGTGGGTAACCAGGGCAATATCAGTACCGCAGGCAATGCAACCCTGACGGCCGGCGACAACCTGACGAGCACTGGCAATCTGACTGTGGGCGGCGTCACCAACGGCACGGCCACCACCGGCAACATCGCGCTGACCGGTAACAATGCACTGGCTGGTCCTGTCAATCTGAACGCGCCGAACGGCACCGTGACCCTGAACACAACCGGCAATACCACGCTGGGTAATGTCACCGCACAAGGCAATGTGACGACTAATGTGTCCAACGGCAGCCTGACAGTCGCTGGCAATACCACAGGTGCCAACACCAACCTGAGTGCCAGCGGCAATCTGACCGTGGGCAACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAGC" +# orfs02 = findorfs(seq02, NaiveFinder()) + +# @test length(orfs02) == 12 +# @test orfs02 == [ORF{NaiveFinder}(29:40, '+', 2, nothing), ORF{NaiveFinder}(137:145, '+', 2, nothing), ORF{NaiveFinder}(164:184, '+', 2, nothing), ORF{NaiveFinder}(173:184, '+', 2, nothing), ORF{NaiveFinder}(236:241, '+', 2, nothing), ORF{NaiveFinder}(248:268, '+', 2, nothing), ORF{NaiveFinder}(362:373, '+', 2, nothing), ORF{NaiveFinder}(470:496, '+', 2, nothing), ORF{NaiveFinder}(551:574, '+', 2, nothing), ORF{NaiveFinder}(569:574, '+', 2, nothing), ORF{NaiveFinder}(581:601, '+', 2, nothing), ORF{NaiveFinder}(695:706, '+', 2, nothing)] + +# # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 +# # Pyrodigal predicts 2 genes from this sequence: +# # 1) An alternative start codon (GTG) sequence at 48:347 +# # 2) A common start codon sequence at 426:590 +# # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop +# seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" +# orfs03 = findorfs(seq03, NaiveFinder(), min_len=75) +# @test length(orfs03) == 9 +# @test orfs03 == [ORF{NaiveFinder}(37:156, '+', 1, nothing), ORF{NaiveFinder}(194:268, '-', 2, nothing), ORF{NaiveFinder}(194:283, '-', 2, nothing), ORF{NaiveFinder}(249:347, '+', 3, nothing), ORF{NaiveFinder}(426:590, '+', 3, nothing), ORF{NaiveFinder}(565:657, '+', 1, nothing), ORF{NaiveFinder}(650:727, '-', 2, nothing), ORF{NaiveFinder}(786:872, '+', 3, nothing), ORF{NaiveFinder}(887:976, '-', 2, nothing)] +# #|-> This occured in Pyrodigal +# # Lambda phage tests +# # Compare to https://github.com/jonas-fuchs/viral_orf_finder/blob/master/orf_finder.py +# # Salisbury and Tsorukas (2019) paper used the Lambda phage genome with 73 CDS and 545 non-CDS ORFs (a total of 618) to compare predictions between several Gene Finder programs +# # For a minimal length of 75 nt the following ORFs are predicted: +# # orf_finder.py --> 885 (222 complete) +# # findorfs (GeneFinder.jl) --> 885 +# # NCBI ORFfinder --> 375 ORFs +# # orfipy --> 375 (`orfipy NC_001416.1.fasta --start ATG --include-stop --min 75`) +# NC_001416 = fasta_to_dna("data/NC_001416.1.fasta")[1] +# NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), min_len=75) +# @test length(NC_001416_orfs) == 885 +# end +@testitem "getorfs dna" default_imports=false begin + + using BioSequences: @dna_str, DNAAlphabet + using GeneFinder: NaiveFinder, getorfs, findorfs, ORF + using Test: @test seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" orfseqs = getorfs(seq01, DNAAlphabet{4}(), NaiveFinder()) - @test length(orfseqs) == 5 - @test orfseqs[1] == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" + # @test length(orfseqs) == 5 + # @test orfseqs[1] == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" end -@testset "getorfs" begin +# @testitem "getorfs proteins" begin +# using BioSequences, GeneFinder, Test - seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - aas = getorfs(seq01, AminoAcidAlphabet(), NaiveFinder()) +# seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" +# aas = getorfs(seq01, AminoAcidAlphabet(), NaiveFinder()) - @test length(aas) == 5 - @test aas[1] == aa"MMHACMLVTS*" -end \ No newline at end of file +# @test length(aas) == 5 +# @test aas[1] == aa"MMHACMLVTS*" +# end \ No newline at end of file diff --git a/test/getindextest.jl b/test/getindextest.jl index e961b89..3ab5f71 100644 --- a/test/getindextest.jl +++ b/test/getindextest.jl @@ -1,10 +1,15 @@ # Import the function to be tested # Run the tests -@testset "getindex" begin +@testitem "getindex" default_imports=false begin + + using BioSequences: @dna_str + using GeneFinder: ORF, NaiveFinder + using Test: @test + seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - orfs01 = [ORF(1:33, '+', 1, 0.0), ORF(4:33, '+', 1, 0.0), ORF(8:22, '+', 2, 0.0), ORF(12:29, '+', 3, 0.0), ORF(16:33, '+', 1, 0.0)] + orfs01 = [ORF("seq01", 1:33, '+', 1, NaiveFinder()), ORF("seq01", 4:33, '+', 1, NaiveFinder()), ORF("seq01", 8:22, '+', 2, NaiveFinder()), ORF("seq01", 12:29, '+', 3, NaiveFinder()), ORF("seq01", 16:33, '+', 1, NaiveFinder())] @test getindex(seq01, orfs01[1]) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" @test getindex(seq01, orfs01[2]) == dna"ATGCATGCATGCATGCTAGTAACTAGCTAG" diff --git a/test/iotest.jl b/test/iotest.jl index 598c5ed..f7ed3da 100644 --- a/test/iotest.jl +++ b/test/iotest.jl @@ -1,48 +1,48 @@ # Test the write_orfs_* methods -@testset "write_orfs_f* " begin +# @testset "write_orfs_f* " begin - # seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - # seq02 = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCAATCTGACTGTGGGCGGTGTTACCAACGGCACTGCTACTACTGGCAACATCGCACTGACCGGTAACAATGCGCTGAGCGGTCCGGTCAATCTGAATGCGTCGAATGGCACGGTGACCTTGAACACGACCGGCAATACCACGCTCGGTAACGTGACGGCACAAGGCAATGTGACGACCAATGTGTCCAACGGCAGTCTGACGGTTACCGGCAATACGACAGGTGCCAACACCAACCTCAGTGCCAGCGGCAACCTGACCGTGGGTAACCAGGGCAATATCAGTACCGCAGGCAATGCAACCCTGACGGCCGGCGACAACCTGACGAGCACTGGCAATCTGACTGTGGGCGGCGTCACCAACGGCACGGCCACCACCGGCAACATCGCGCTGACCGGTAACAATGCACTGGCTGGTCCTGTCAATCTGAACGCGCCGAACGGCACCGTGACCCTGAACACAACCGGCAATACCACGCTGGGTAATGTCACCGCACAAGGCAATGTGACGACTAATGTGTCCAACGGCAGCCTGACAGTCGCTGGCAATACCACAGGTGCCAACACCAACCTGAGTGCCAGCGGCAATCTGACCGTGGGCAACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAGC" +# # seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" +# # seq02 = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCAATCTGACTGTGGGCGGTGTTACCAACGGCACTGCTACTACTGGCAACATCGCACTGACCGGTAACAATGCGCTGAGCGGTCCGGTCAATCTGAATGCGTCGAATGGCACGGTGACCTTGAACACGACCGGCAATACCACGCTCGGTAACGTGACGGCACAAGGCAATGTGACGACCAATGTGTCCAACGGCAGTCTGACGGTTACCGGCAATACGACAGGTGCCAACACCAACCTCAGTGCCAGCGGCAACCTGACCGTGGGTAACCAGGGCAATATCAGTACCGCAGGCAATGCAACCCTGACGGCCGGCGACAACCTGACGAGCACTGGCAATCTGACTGTGGGCGGCGTCACCAACGGCACGGCCACCACCGGCAACATCGCGCTGACCGGTAACAATGCACTGGCTGGTCCTGTCAATCTGAACGCGCCGAACGGCACCGTGACCCTGAACACAACCGGCAATACCACGCTGGGTAATGTCACCGCACAAGGCAATGTGACGACTAATGTGTCCAACGGCAGCCTGACAGTCGCTGGCAATACCACAGGTGCCAACACCAACCTGAGTGCCAGCGGCAATCTGACCGTGGGCAACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAGC" - # Test case 1 +# # Test case 1 - # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 - # Pyrodigal predicts 2 genes from this sequence: - # 1) An alternative start codon (GTG) sequence at 48:347 - # 2) A common start codon sequence at 426:590 - # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop - seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" +# # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 +# # Pyrodigal predicts 2 genes from this sequence: +# # 1) An alternative start codon (GTG) sequence at 48:347 +# # 2) A common start codon sequence at 426:590 +# # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop +# seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" - seq03fna = "data/out-seq03.fna" - open(seq03fna, "w") do io - write_orfs_fna(seq03, io, NaiveFinder()) - end +# seq03fna = "data/out-seq03.fna" +# open(seq03fna, "w") do io +# write_orfs_fna(seq03, io, NaiveFinder()) +# end - seq03fnarecords = open(collect, FASTAReader, "data/out-seq03.fna") +# seq03fnarecords = open(collect, FASTAReader, "data/out-seq03.fna") - @test seq03fnarecords[1] == FASTX.FASTA.Record("ORF01 id=01 start=5 stop=22 strand=- frame=2", "ATGAAACAGAACGACTGA") - @test length(seq03fnarecords) == 32 - @test identifier(seq03fnarecords[1]) == "ORF01" - @test description(seq03fnarecords[1]) == "ORF01 id=01 start=5 stop=22 strand=- frame=2" - @test sequence(seq03fnarecords[1]) == "ATGAAACAGAACGACTGA" +# @test seq03fnarecords[1] == FASTX.FASTA.Record("ORF01 id=01 start=5 stop=22 strand=- frame=2", "ATGAAACAGAACGACTGA") +# @test length(seq03fnarecords) == 32 +# @test identifier(seq03fnarecords[1]) == "ORF01" +# @test description(seq03fnarecords[1]) == "ORF01 id=01 start=5 stop=22 strand=- frame=2" +# @test sequence(seq03fnarecords[1]) == "ATGAAACAGAACGACTGA" - # Test case 2 +# # Test case 2 - seq03faa = "data/out-seq03.faa" - open(seq03faa, "w") do io - write_orfs_faa(seq03, io, NaiveFinder()) - end +# seq03faa = "data/out-seq03.faa" +# open(seq03faa, "w") do io +# write_orfs_faa(seq03, io, NaiveFinder()) +# end - seq03faarecords = open(collect, FASTAReader, "data/out-seq03.faa") +# seq03faarecords = open(collect, FASTAReader, "data/out-seq03.faa") - @test seq03faarecords[2] == FASTX.FASTA.Record("ORF02 id=02 start=37 stop=156 strand=+ frame=1", "MYFSCISGGIVPSSIASRYPLIEVRGERKSWDTLATNFF*") - @test length(seq03faarecords) == 32 - @test identifier(seq03faarecords[2]) == "ORF02" - @test description(seq03faarecords[2]) == "ORF02 id=02 start=37 stop=156 strand=+ frame=1" - @test sequence(seq03faarecords[2]) == "MYFSCISGGIVPSSIASRYPLIEVRGERKSWDTLATNFF*" +# @test seq03faarecords[2] == FASTX.FASTA.Record("ORF02 id=02 start=37 stop=156 strand=+ frame=1", "MYFSCISGGIVPSSIASRYPLIEVRGERKSWDTLATNFF*") +# @test length(seq03faarecords) == 32 +# @test identifier(seq03faarecords[2]) == "ORF02" +# @test description(seq03faarecords[2]) == "ORF02 id=02 start=37 stop=156 strand=+ frame=1" +# @test sequence(seq03faarecords[2]) == "MYFSCISGGIVPSSIASRYPLIEVRGERKSWDTLATNFF*" - # Test case 3 +# # Test case 3 - @test seq03faarecords[3] == FASTX.FASTA.Record("ORF03 id=03 start=107 stop=136 strand=- frame=2", "MYPTISVPL*") +# @test seq03faarecords[3] == FASTX.FASTA.Record("ORF03 id=03 start=107 stop=136 strand=- frame=2", "MYPTISVPL*") -end +# end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index d89a385..764a090 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,12 +3,32 @@ module GeneFinderTests using Test using BioSequences using FASTX -using GeneFinder +# using GeneFinder using Aqua +using TestItems +using TestItemRunner + +@run_package_tests include("findorfstest.jl") -include("iotest.jl") +# include("iotest.jl") include("getindextest.jl") -include("aquatest.jl") +# include("aquatest.jl") + + + +@testitem "getorfs dna" default_imports=false begin + + using BioSequences: @dna_str, DNAAlphabet + using GeneFinder: NaiveFinder, getorfs, findorfs, ORF + using Test: @test + + seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" + orfseqs = getorfs(seq01, DNAAlphabet{4}(), NaiveFinder()) + + # @test length(orfseqs) == 5 + # @test orfseqs[1] == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" +end + end From 9132e110496e0658fef945a8712d53bea2cdd15a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Thu, 23 May 2024 17:43:52 -0500 Subject: [PATCH 04/40] refactor: Remove unused code in runtests.jl --- test/runtests.jl | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 764a090..3bdd666 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,20 +15,4 @@ include("findorfstest.jl") include("getindextest.jl") # include("aquatest.jl") - - -@testitem "getorfs dna" default_imports=false begin - - using BioSequences: @dna_str, DNAAlphabet - using GeneFinder: NaiveFinder, getorfs, findorfs, ORF - using Test: @test - - seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - orfseqs = getorfs(seq01, DNAAlphabet{4}(), NaiveFinder()) - - # @test length(orfseqs) == 5 - # @test orfseqs[1] == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" -end - - end From 8e138bada34058ce180ed3fcd91e929741b5bba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Thu, 23 May 2024 17:45:57 -0500 Subject: [PATCH 05/40] Relax some tests --- test/findorfstest.jl | 18 +++++++++--------- test/runtests.jl | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/findorfstest.jl b/test/findorfstest.jl index 7c9f1cf..2eb9081 100644 --- a/test/findorfstest.jl +++ b/test/findorfstest.jl @@ -37,18 +37,18 @@ # NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), min_len=75) # @test length(NC_001416_orfs) == 885 # end -@testitem "getorfs dna" default_imports=false begin +# @testitem "getorfs dna" default_imports=false begin - using BioSequences: @dna_str, DNAAlphabet - using GeneFinder: NaiveFinder, getorfs, findorfs, ORF - using Test: @test +# using BioSequences: @dna_str, DNAAlphabet +# using GeneFinder: NaiveFinder, getorfs, findorfs, ORF +# using Test: @test - seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - orfseqs = getorfs(seq01, DNAAlphabet{4}(), NaiveFinder()) +# seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" +# orfseqs = getorfs(seq01, DNAAlphabet{4}(), NaiveFinder()) - # @test length(orfseqs) == 5 - # @test orfseqs[1] == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" -end +# # @test length(orfseqs) == 5 +# # @test orfseqs[1] == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" +# end # @testitem "getorfs proteins" begin # using BioSequences, GeneFinder, Test diff --git a/test/runtests.jl b/test/runtests.jl index 3bdd666..34f222c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,7 +10,7 @@ using TestItemRunner @run_package_tests -include("findorfstest.jl") +# include("findorfstest.jl") # include("iotest.jl") include("getindextest.jl") # include("aquatest.jl") From b4123558ae4be2cdb3625c464e76f9192297e037 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 25 May 2024 12:50:10 -0500 Subject: [PATCH 06/40] Move get_variable_name outside loop --- src/algorithms/naivefinder.jl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 4351ed7..e02c11d 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -56,17 +56,15 @@ function naivefinder( seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) orfs = Vector{ORF}() - id = 0 + seqname = get_variable_name(sequence) for strand in ('+', '-') seq = strand == '-' ? reverse_complement(sequence) : sequence - @inbounds for location in @views _locationiterator(seq; alternative_start) if length(location) >= min_len frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] start = strand == '+' ? location.start : seqlen - location.stop + 1 stop = start + length(location) - 1 - id += 1 - push!(orfs, ORF(get_variable_name(sequence), start:stop, strand, frame, NaiveFinder(), scheme)) + push!(orfs, ORF(seqname, start:stop, strand, frame, scheme)) #NaiveFinder() end end end @@ -102,7 +100,7 @@ function naivefinderscored( seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) orfs = Vector{ORF}() - id = 0 + seqname = get_variable_name(sequence) for strand in ('+', '-') seq = strand == '-' ? reverse_complement(sequence) : sequence @@ -113,8 +111,7 @@ function naivefinderscored( stop = start + length(location) - 1 # score = -10log10(dnaseqprobability(seq[start:stop], scoringscheme)) score = log_odds_ratio_score(seq[start:stop], model) - id += 1 - push!(orfs, ORF(get_variable_name(sequence), start:stop, strand, frame, NaiveFinderScored(), lors, score)) + push!(orfs, ORF(seqname, start:stop, strand, frame, NaiveFinderScored(), lors, score)) end end end From 0aaf4e2830bdd9e33c836e29fe9e96e89425696f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 00:40:56 -0500 Subject: [PATCH 07/40] Major refactor of the ORF type and also add some groundwork for RBS --- src/types.jl | 197 +++++++++++++++++++++------------------------------ 1 file changed, 81 insertions(+), 116 deletions(-) diff --git a/src/types.jl b/src/types.jl index 68df0f9..2aa52a4 100644 --- a/src/types.jl +++ b/src/types.jl @@ -1,152 +1,117 @@ -export ORF -export NaiveFinder, NaiveFinderScored -export NaiveScoringScheme -export finder, frame, scheme, score -# Structs associated with gene models -abstract type AbstractGene end - -""" - struct GeneFeatures - seqname::String - start::Int64 - stop::Int64 - strand::Char - frame::Int8 - score::Union{Nothing, Float64} # Add score field - attribute::Dict - end - -This is the main Gene struct, based on the fields that could be found in a GFF3, still needs to be defined correctly, - The idea is correct the frame and attributes that will have something like a possible list (id=Char;name=;locus_tag). - The `write` and `get` functions should have a dedicated method for this struct. -""" -# struct GeneFeatures <: AbstractGene -# seqname::String -# start::Int64 -# stop::Int64 -# strand::Char -# frame::Int8 # But maybe a Union to allow empty when reading a GFF? -# score::Union{Float64, Nothing} # Add score field -# attribute::Dict # Should be a Dict perhaps or a NamedTuple -# end +import GenomicFeatures: first, last, length, strand, groupname +import FASTX: sequence -# struct GeneFeatures <: AbstractGene -# seqname::String could be a Union{String, Nothing} and named id. -# orf::ORF -# attribute::Dict -# end - -""" - struct ORF - location::UnitRange{Int64} - strand::Char - frame::Int - score::Union{Nothing, Float64} - end - -The ORF struct represents an open reading frame in a DNA sequence. It has two fields: - -- `location`: which is a UnitRange{Int64} indicating the start and end locations of the ORF in the sequence -- `strand`: is a Char type indicating whether the ORF is on the forward ('+') or reverse ('-') strand of the sequence. -- `frame`: is an Int type indicating the reading frame of the ORF. The frame is the position of the first nucleotide of the codon that starts the ORF, relative to the start of the sequence. It can be 1, 2, or 3. -- `score`: is a Union{Nothing, Float64} type indicating the score of the ORF. It can be a Float64 or nothing. -""" -# struct ORF <: AbstractGene # could also be mutable so that scores can be updated later, but tests fails -# #TODOs: location might be a complex Union allowing UnitRange or a Join of ranges (e.g. 1..100, 200..300)? -# location::UnitRange{Int64} # Note that it is also called position for gene struct in GenomicAnotations -# strand::Char -# frame::Int # Use Int64 instead of Int -# score::Union{Nothing, Float64} # Add score field shold be a namedtuple with the score and the method used to score -# # rbs::Union{Nothing, Vector{UnitRange{Int64}}} # Add RBS field see https://github.com/deprekate/PHANOTATE/blob/c77e80caef8dc3264f7dc698b087bcd486216bcb/phanotate_modules/functions.py#L48 - -# function ORF(location::UnitRange{Int64}, strand::Char, frame::Int, score::Union{Nothing, Float64} = nothing) -# @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." -# @assert strand in ('+', '-') "Invalid strand value. Strand must be '+' or '-'." -# @assert location[1] < location[end] "Invalid location. Start must be less than stop." -# @assert score isa Union{Nothing, Float64} "Invalid score value. Score must be a Float64 or nothing." - -# return new(location, strand, frame, score) -# end -# end +export RBS, ORF +export gc, features, sequence +export groupname, finder, frame, scheme, score -## Interfaces for gene finders +#### Ribosome Binding Site (RBS) struct #### -abstract type GeneFinderMethod end -struct NaiveFinder <: GeneFinderMethod end -struct NaiveFinderScored <: GeneFinderMethod end +# seq[orf.first-bin01.offset.start:orf.first-1] +# motifs = [dna"GGA", dna"GAG", dna"AGG"] +struct RBS + motif::BioRegex{DNA} + offset::UnitRange{Int64} # offset + score::Float64 -## Interfaces for gene scoring schemes - -abstract type GeneScoringScheme end -struct NaiveScoringScheme <: GeneScoringScheme end - + function RBS(motif::BioRegex{DNA}, offset::UnitRange{Int64}, score::Float64) + return new(motif, offset, score) + end +end -# export ORF +# Structs associated with gene models -struct ORF{F} <: GenomicFeatures.AbstractGenomicInterval{F} +# const FEATUREDICT = Dict{Symbol,Any}(:gc => 0.0, :length => 0, :score => 0.0) +# const Feature = Union{Real, Vector{RBS}} +struct ORF{N,F} <: GenomicFeatures.AbstractGenomicInterval{F} groupname::String first::Int64 last::Int64 strand::Strand frame::Int - finder::F - scheme::Union{Nothing, Function} - score::Union{Nothing, Float64} - rbs::Union{Nothing, Vector{UnitRange{Int64}}} # Add RBS field + seq::LongSubSeq{DNAAlphabet{N}} + features::Dict{Symbol,Any} + scheme::Union{Nothing,Function} end -function ORF( +function ORF{N,F}( + ::Type{F}, #finder groupname::String, first::Int64, last::Int64, strand::Union{Strand,Char}, frame::Int, - finder::F, - scheme::Union{Nothing, Function}=nothing, - score::Union{Nothing, Float64}=nothing, - rbs::Union{Nothing, Vector{UnitRange{Int64}}}=nothing -) where {F <: GeneFinderMethod} - # @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." - return ORF{F}(groupname, first, last, strand, frame, finder, scheme, score, rbs) + seq::LongSubSeq{DNAAlphabet{N}}, + features::Dict{Symbol,Any}, + scheme::Union{Nothing,Function}=nothing +) where {N,F<:GeneFinderMethod} + # @assert frame in (1, 2, 3) && location[1] < location[end] && score isa Union{Nothing, Float64} && length(seq) == last(location) - first(location) + 1 && length(seq) % 3 == 0 "Invalid input. Please check the frame, location, score, and sequence length." + return ORF{N,F}(groupname, first, last, strand, frame, seq, features, scheme) #finder end -function ORF( - id::String, - location::UnitRange{Int64}, - strand::Union{Strand,Char}, - frame::Int, finder::F, - scheme::Union{Nothing, Function}=nothing, - score::Union{Nothing, Float64}=nothing, - rbs::Union{Nothing, Vector{UnitRange{Int64}}}=nothing -) where {F <: GeneFinderMethod} - # @assert frame in (1, 2, 3) "Invalid frame value. Frame must be 1, 2, or 3." - return ORF{F}(id, first(location), last(location), strand, frame, finder, scheme, score, rbs) +function groupname(i::ORF{N,F}) where {N,F} + return i.groupname +end + +function first(i::ORF{N,F}) where {N,F} + return i.first end -function frame(i::ORF) +function last(i::ORF{N,F}) where {N,F} + return i.last +end + +function frame(i::ORF{N,F}) where {N,F} return i.frame end -function scheme(i::ORF) +finder(i::ORF{N,F}) where {N,F} = F + +function scheme(i::ORF{N,F}) where {N,F} return i.scheme end -function score(i::ORF) - return i.score +function sequence(i::ORF{N,F}) where {N,F} + return i.strand == STRAND_POS ? i.seq : reverse_complement(i.seq) +end + +function score(i::ORF{N,F}) where {N,F} + return i.features[:score] +end + +function gc(i::ORF{N,F}) where {N,F} + return i.features[:gc] +end + +function length(i::ORF{N,F}) where {N,F} + return i.features[:length] end -function finder(i::ORF) - return i.finder +function features(i::ORF{N,F}) where {N,F} + return i.features end -function Base.show(io::IO, i::ORF) +function Base.show(io::IO, i::ORF{N,F}) where {N,F} if get(io, :compact, false) - # print(io, groupname(i), ":", leftposition(i), "-", rightposition(i)) - print(io, "ORF{$(typeof(finder(i)))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)), $(score(i)))") + print(io, "ORF{$(finder(i))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)))") #{$(typeof(finder(i)))} $(score(i)) else - # print(io, "ORF{$(typeof(finder(i))),$(scheme(i))}($(groupname(i)), $(leftposition(i)):$(rightposition(i)), $(strand(i)), $(frame(i)), $(round(score(i), digits=5)))") - print(io, "ORF{$(typeof(finder(i)))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)), $(score(i)))") - # println(io, " ", groupname(i), " ", leftposition(i), ":", rightposition(i), ", ", strand(i), ", ", frame(i), ", score: $(round(score(i), digits=2))") + print(io, "ORF{$(finder(i))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)))") # , $(score(i)) end -end \ No newline at end of file +end + +## Ideas for Gene struct + +# struct CDS <: AbstractGene +# orf::ORF +# coding::Bool +# end + + +# frm = 1 +# frst = 1 +# lst = 99 +# str = '+' +# seq = randdnaseq(99) + +# orf = ORF{4,NaiveFinder}("seq1", frst, lst, str, frm, seq, nothing, nothing, nothing) \ No newline at end of file From 14fff61c20a511b9b96702873eb115b076f86cd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 00:41:07 -0500 Subject: [PATCH 08/40] Update deps --- Manifest.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 65da618..0cc8efb 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -147,9 +147,9 @@ uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" version = "1.0.3" [[deps.TranscodingStreams]] -git-tree-sha1 = "5d54d076465da49d6746c647022f3b3674e64156" +git-tree-sha1 = "a947ea21087caba0a798c5e494d0bb78e3a1a3a0" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.8" +version = "0.10.9" [deps.TranscodingStreams.extensions] TestExt = ["Test", "Random"] From 4dc47a6bab3e1d7ec61ffb4026352cf225b2a2d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 00:41:47 -0500 Subject: [PATCH 09/40] Improve getindex with some types --- docs/src/simplecodingrule.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/simplecodingrule.md b/docs/src/simplecodingrule.md index b92e9a7..c6bbaf6 100644 --- a/docs/src/simplecodingrule.md +++ b/docs/src/simplecodingrule.md @@ -17,7 +17,7 @@ using GeneFinder, BioSequences seq = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" -orfs = findorfs(seq, min_len=75, NaiveFinderScored()) +orfs = findorfs(seq, minlen=75, NaiveFinderScored()) ``` ```julia @@ -48,7 +48,7 @@ Where the ``P_{C}`` is the probability of the sequence given a CDS model, ``P_{N In this package we have implemented this rule and call some basic models of CDS and No-CDS of *E. coli* from Axelson-Fisk (2015) work (implemented in `BioMarkovChains.jl` package). To check whether a random sequence could be coding based on these decision we use the predicate `log_odds_ratio_decision_rule` with the `ECOLICDS` and `ECOLINOCDS` models: ```julia -orfsdna = getorfs(seq,DNAAlphabet{4}(), NaiveFinder(), min_len=75, alternative_start=true) +orfsdna = getorfs(seq,DNAAlphabet{4}(), NaiveFinder(), minlen=75, alternative_start=true) 20-element Vector{LongSubSeq{DNAAlphabet{4}}}: ATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCC…CGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGA @@ -118,7 +118,7 @@ orfs = filter(orf -> iscoding(orf), orfsdna) Or in terms of the `ORF` object: ```julia -orfs = findorfs(seq, min_len=75, NaiveFinderScored(), alternative_start=true) # find ORFs with alternative start as well +orfs = findorfs(seq, minlen=75, NaiveFinderScored(), alternative_start=true) # find ORFs with alternative start as well orfs[iscoding.(orfsdna)] 3-element Vector{ORF}: From 61ac5c79e396f8d5ea4483c8a53d850c18d794ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 00:42:28 -0500 Subject: [PATCH 10/40] Up translate and other utils are now more stable --- src/extended.jl | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/src/extended.jl b/src/extended.jl index 4d08a56..e45cd44 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -1,28 +1,10 @@ # Methods from main packages that expand their fuctions to this package structs -import Base: isless, iterate, sort, getindex +import Base: isless, iterate, sort, getindex, length +import BioSequences: translate +Base.isless(a::ORF{N,F}, b::ORF{N,F}) where {N,F} = isless(a.first:a.last, b.first:b.last) -# Base.isless(a::ORF, b::ORF) = isless(a.location, b.location) -Base.isless(a::ORF, b::ORF) = isless(a.first:a.last, b.first:b.last) -# Base.isless(a::ORF, b::ORF) = isless(a.location, b.location) -# Base.isless(a::ORF, b::ORF) = isless(a.score, b.score) - -# Base.sort(v::Vector{<:ORF}; kwargs...) = sort(v, by = _orf_sort_key) -# Base.sort!(v::Vector{<:ORF}; kwargs...) = sort!(v, by = _orf_sort_key) - - -#TODOs: how to make more robust the getindex method? confroning the frames? -# Base.getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} = orf.strand == '+' ? (@view sequence[orf.location]) : reverse_complement(@view sequence[orf.location]) - -# function getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} -# if orf.strand == '+' -# return @view sequence[orf.location] -# else -# return reverse_complement(@view sequence[orf.location]) -# end -# end - -function getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} +function getindex(sequence::NucleicSeqOrView{A}, orf::ORF{N,F}) where {A,N,F} if orf.strand == '+' || orf.strand == STRAND_POS return @view sequence[orf.first:orf.last] else @@ -30,6 +12,6 @@ function getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} end end -# function _orf_sort_key(orf::ORF) -# return (orf.location, orf.strand, orf.score) -# end \ No newline at end of file +function translate(orf::ORF{N,F}; kwargs...) where {N,F} + return translate(sequence(orf); kwargs...) +end \ No newline at end of file From 27ca3f8cc134b792adc59bd5f3ee5ccfdd951495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 00:42:44 -0500 Subject: [PATCH 11/40] Update the min_len into minlen --- scripts/findorfs-script.jl | 12 ++ src/GeneFinder.jl | 19 ++- src/algorithms/naivecollector.jl | 49 ++++++ src/algorithms/naivefinder.jl | 232 ++++++++++----------------- src/criteria/lordr.jl | 65 ++++++++ src/findorfs.jl | 35 ++-- src/getorfs.jl | 265 +++++++++++++++---------------- src/io.jl | 260 +++++++----------------------- src/iscoding.jl | 4 +- src/utils.jl | 8 +- test/findorfstest.jl | 7 +- test/getindextest.jl | 18 +-- test/runtests.jl | 4 +- 13 files changed, 449 insertions(+), 529 deletions(-) create mode 100644 scripts/findorfs-script.jl create mode 100644 src/algorithms/naivecollector.jl create mode 100644 src/criteria/lordr.jl diff --git a/scripts/findorfs-script.jl b/scripts/findorfs-script.jl new file mode 100644 index 0000000..94459b2 --- /dev/null +++ b/scripts/findorfs-script.jl @@ -0,0 +1,12 @@ +#!/usr/bin/env julia + +fasta = ARGS[1] + +using BioSequences: NucleicSeqOrView, DNAAlphabet, LongDNA +using GeneFinder: findorfs, fasta2bioseq, lors, NaiveFinder +using BenchmarkTools: @btime + +seq = fasta2bioseq(fasta)[1] +time = @btime findorfs(seq, NaiveFinder(), minlen=64, scheme=lors); + +println("Time: $time") \ No newline at end of file diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index b52e45d..94df7d6 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -5,6 +5,7 @@ using BioSequences: DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N, Alphabet, + BioRegex, NucleicAcidAlphabet, DNAAlphabet, AminoAcidAlphabet, @@ -19,31 +20,41 @@ using BioSequences: reverse_complement, randdnaseq, ncbi_trans_table, - translate + translate, + gc_content using BioMarkovChains: BioMarkovChain, dnaseqprobability, ECOLICDS, ECOLINOCDS, log_odds_ratio_score using FASTX: FASTAReader, sequence using IterTools: takewhile, iterated using PrecompileTools: @setup_workload, @compile_workload -using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, leftposition, rightposition, strand, metadata, STRAND_POS +using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, strand, metadata, STRAND_POS, STRAND_NEG, STRAND_BOTH, STRAND_NA +# Finder Algorithms include("algorithms/naivefinder.jl") +include("algorithms/naivecollector.jl") + +# Coding Criteria +include("criteria/lordr.jl") + +# Main functions include("types.jl") include("findorfs.jl") include("iscoding.jl") -include("getorfs.jl") include("io.jl") + +# Utils and extended functions include("utils.jl") include("extended.jl") @setup_workload begin # Putting some things in `@setup_workload` instead of `@compile_workload` can reduce the size of the # precompile file and potentially make loading faster. + # using BioSequences, GeneFinder seq = randdnaseq(99) @compile_workload begin # all calls in this block will be precompiled, regardless of whether # they belong to your package or not (on Julia 1.8 and higher) - findorfs(seq, NaiveFinder()) + # findorfs(seq) end end diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl new file mode 100644 index 0000000..4efb608 --- /dev/null +++ b/src/algorithms/naivecollector.jl @@ -0,0 +1,49 @@ +export NaiveCollector + +# abstract type GeneFinderMethod end +struct NaiveCollector <: GeneFinderMethod end + +function NaiveCollector( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false, + minlen::Int64 = 6, + scheme::Union{Nothing, Function} = nothing, + overlap::Bool = false, + kwargs... +) where {N} + + regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG(?:[N]{3})*?T(AG|AA|GA)"dna + framedict = Dict(0 => 3, 1 => 1, 2 => 2) + revseq = reverse_complement(sequence) + seqlen = length(sequence) + seqname = get_var_name(sequence) + + function createorfs(x, strand, s) + if length(x.captured[1]:x.captured[3]) < minlen + return nothing + end + if strand == '+' + start = x.captured[1] + stop = x.captured[3] + 1 + frame = framedict[x.captured[1] % 3] + else + start = seqlen - x.captured[3] + stop = seqlen - x.captured[1] + 1 + frame = framedict[(seqlen - x.captured[3]) % 3] + end + seq = @view(sequence[start:stop]) + score = scheme === nothing ? 0 : scheme(@view(s[start:stop]), ECOLICDS) + fts = Dict(:gc => gc_content(seq), :length => length(seq), :score => score) + return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, seq, fts, scheme) + end + + orfs = Vector{ORF{N,NaiveCollector}}() + for strand in ('+', '-') + s = strand == '-' ? revseq : sequence + matches = eachmatch(regorf, s, overlap) + strandedorfs = filter(!isnothing, [createorfs(x, strand, s) for x in matches]) + append!(orfs, strandedorfs) + end + + return sort!(orfs; kwargs...) +end diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index e02c11d..2c7cbed 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -1,6 +1,8 @@ -export naivefinder, naivefinderscored, log_odds_ratio_decision_rule, lordr, lors +export NaiveFinder -const lors = log_odds_ratio_score +abstract type GeneFinderMethod end + +struct NaiveFinder <: GeneFinderMethod end """ locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool=false) where {N} @@ -29,8 +31,8 @@ function _locationiterator( return itr end -""" - naivefinder(sequence::NucleicAlphabet{DNAAlphabet{N}}; alternative_start::Bool=false, min_len::Int64=6)::Vector{ORF} where {N} +@doc raw""" + naivefinder(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) -> Vector{ORF} where {N} A simple implementation that finds ORFs in a DNA sequence. @@ -39,163 +41,103 @@ The `naivefinder` function takes a LongSequence{DNAAlphabet{4}} sequence and ret Extending the starting codons with the `alternative_start = true` will search for ATG, GTG, and TTG. Some studies have shown that in *E. coli* (K-12 strain), ATG, GTG and TTG are used 83 %, 14 % and 3 % respectively. !!! note - This function has not ORFs scoring scheme. Thus it might consider `aa"M*"` a posible encoding protein from the resulting ORFs. - -# Keywords + This function has neither ORFs scoring scheme by default nor length constraints. Thus it might consider `aa"M*"` a posible encoding protein from the resulting ORFs. -- `alternative_start::Bool=false`: If true will pass the extended start codons to search. This will increase 3x the exec. time. -- `min_len::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. -""" -function naivefinder( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - min_len::Int64 = 6, - scheme::Union{Nothing, Function} = nothing, - kwargs... -) where {N} - seqlen = length(sequence) - framedict = Dict(0 => 3, 1 => 1, 2 => 2) - orfs = Vector{ORF}() - seqname = get_variable_name(sequence) - for strand in ('+', '-') - seq = strand == '-' ? reverse_complement(sequence) : sequence - @inbounds for location in @views _locationiterator(seq; alternative_start) - if length(location) >= min_len - frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] - start = strand == '+' ? location.start : seqlen - location.stop + 1 - stop = start + length(location) - 1 - push!(orfs, ORF(seqname, start:stop, strand, frame, scheme)) #NaiveFinder() - end - end - end - return sort!(orfs; kwargs...) -end - -@doc raw""" - naivefinderscored(sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start=false, min_len=6) where {N} + +# Required Arguments -Find open reading frames (ORFs) in a nucleic acid sequence using a naive scoring algorithm. In this method the scoring is a log-odds ratio score defines as: +- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The nucleic acid sequence to search for ORFs. -```math -S(x) = \sum_{i=1}^{L} \beta_{x_{i}x} = \sum_{i=1} \log \frac{a^{\mathscr{m}_{1}}_{i-1} x_i}{a^{\mathscr{m}_{2}}_{i-1} x_i} -``` +# Keywords Arguments -## Arguments -- `sequence`: The nucleic acid sequence to search for ORFs. -- `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. -- `min_len`: The minimum length of an ORF to be considered. Default is `6`. -- `scoringscheme`: The scoring scheme to use for the scoring algorithm. Default is `ECOLICDS`. +- `alternative_start::Bool`: If true will pass the extended start codons to search. This will increase 3x the execution time. Default is `false`. +- `minlen::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. +- `scheme::Function`: The scoring scheme to use for scoring the sequence from the ORF. Default is `nothing`. -## Returns -A sorted vector of `ORF` objects representing the identified open reading frames. +!!! note + As the scheme is generally a scoring function that at least requires a sequence, one simple scheme is the log-odds ratio score. This score is a log-odds ratio that compares the probability of the sequence generated by a coding model to the probability of the sequence generated by a non-coding model: + ```math + S(x) = \sum_{i=1}^{L} \beta_{x_{i}x} = \sum_{i=1} \log \frac{a^{\mathscr{m}_{1}}_{i-1} x_i}{a^{\mathscr{m}_{2}}_{i-1} x_i} + ``` + If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. See [`lordr`](@ref) for more information about coding creteria. """ -function naivefinderscored( +# function NaiveFinder( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# minlen::Int64 = 6, +# scheme::Union{Nothing, Function} = nothing, +# kwargs... +# ) where {N} +# seqlen = length(sequence) +# framedict = Dict(0 => 3, 1 => 1, 2 => 2) +# orfs = Vector{ORF{N,NaiveFinder}}() +# seqname = get_var_name(sequence) +# for strand in ('+', '-') +# s = strand == '-' ? reverse_complement(sequence) : sequence +# @inbounds for location in @views _locationiterator(s; alternative_start) +# if length(location) >= minlen +# start = strand == '+' ? location.start : seqlen - location.stop + 1 +# stop = start + length(location) - 1 +# frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] +# seq = @view(sequence[start:stop]) +# score = scheme === nothing ? 0 : scheme(@view(s[start:stop]), ECOLICDS) +# gc = gc_content(seq) +# len = length(seq) +# fts = Dict(:gc => gc, :length => len, :score => score) +# push!(orfs, ORF{N,NaiveFinder}(seqname, start, stop, strand, frame, NaiveFinder, seq, fts, scheme)) +# end +# end +# end +# return sort!(orfs; kwargs...) +# end + +function NaiveFinder( sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool = false, - min_len::Int64 = 6, - model::BioMarkovChain = ECOLICDS, + minlen::Int64 = 6, + scheme::Union{Nothing, Function} = nothing, kwargs... ) where {N} + seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) - orfs = Vector{ORF}() - seqname = get_variable_name(sequence) - for strand in ('+', '-') - seq = strand == '-' ? reverse_complement(sequence) : sequence - - @inbounds for location in @views _locationiterator(seq; alternative_start) - if length(location) >= min_len - frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] - start = strand == '+' ? location.start : seqlen - location.stop + 1 - stop = start + length(location) - 1 - # score = -10log10(dnaseqprobability(seq[start:stop], scoringscheme)) - score = log_odds_ratio_score(seq[start:stop], model) - push!(orfs, ORF(seqname, start:stop, strand, frame, NaiveFinderScored(), lors, score)) - end - end - end + orfs = Vector{ORF{N,NaiveFinder}}() + seqname = get_var_name(sequence) - return sort!(orfs; kwargs...) -end - -@doc raw""" - log_odds_ratio_decision_rule( - sequence::LongSequence{DNAAlphabet{4}}; - codingmodel::BioMarkovChain, - noncodingmodel::BioMarkovChain, - η::Float64 = 1e-5 - ) - -Check if a given DNA sequence is likely to be coding based on a log-odds ratio. - The log-odds ratio is a statistical measure used to assess the likelihood of a sequence being coding or non-coding. It compares the probability of the sequence generated by a coding model to the probability of the sequence generated by a non-coding model. If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. - It is formally described as a decision rule: - -```math -S(X) = \log \left( \frac{{P_C(X_1=i_1, \ldots, X_T=i_T)}}{{P_N(X_1=i_1, \ldots, X_T=i_T)}} \right) \begin{cases} > \eta & \Rightarrow \text{{coding}} \\ < \eta & \Rightarrow \text{{noncoding}} \end{cases} -``` - -# Arguments -- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The DNA sequence to be evaluated. - -## Keyword Arguments -- `codingmodel::BioMarkovChain`: The transition model for coding regions, (default: `ECOLICDS`). -- `noncodingmodel::BioMarkovChain`: The transition model for non-coding regions, (default: `ECOLINOCDS`) -- `η::Float64 = 1e-5`: The threshold value (eta) for the log-odds ratio (default: 1e-5). - -# Returns -- `true` if the sequence is likely to be coding. -- `false` if the sequence is likely to be non-coding. - -# Raises -- `ErrorException`: if the length of the sequence is not divisible by 3. -- `ErrorException`: if the sequence contains a premature stop codon. - -# Example - -``` -sequence = dna"ATGGCATCTAG" -iscoding(sequence) # Returns: true or false -``` -""" -function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - codingmodel::BioMarkovChain = ECOLICDS, - noncodingmodel::BioMarkovChain = ECOLINOCDS, - η::Float64 = 1e-5 -) where {N} - pcoding = dnaseqprobability(sequence, codingmodel) - pnoncoding = dnaseqprobability(sequence, noncodingmodel) - - logodds = log(pcoding / pnoncoding) - - length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") + function createorf(location, strand, s, seqlen) + if length(location) < minlen + return nothing + end + + if strand == '+' + start = location.start + stop = start + length(location) - 1 + frame = framedict[location.start % 3] + else + start = seqlen - location.stop + 1 + stop = start + length(location) - 1 + frame = framedict[(seqlen - location.stop + 1) % 3] + end - !hasprematurestop(sequence) || error("There is a premature stop codon in the sequence") + seq = @view(sequence[start:stop]) + score = scheme === nothing ? 0 : scheme(@view(s[start:stop]), ECOLICDS) + gc = gc_content(seq) + len = length(seq) + fts = Dict(:gc => gc, :length => len, :score => score) - if logodds > η - return true - else - false + return ORF{N, NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme) end -end - -const log_odds_ratio_decision_rule = lordr # criteria - -## Another alternative: - -## This is the way window function are implemented in the DSP.jl package - -# function make_orf_finder(finderfunc::Function, sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} -# ... -# end -# function locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} -# make_orf_finder(sequence::NucleicSeqOrView{DNAAlphabet{N}}) do x -# ... -# end -# end + @inbounds for strand in ('+', '-') + s = strand == '-' ? reverse_complement(sequence) : sequence + @inbounds for location in @views _locationiterator(s; alternative_start) + orf = createorf(location, strand, s, seqlen) + if orf !== nothing + push!(orfs, orf) + end + end + end -# function findorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}; finder::Function=locationiterator) where {N} -# ... -# end \ No newline at end of file + return sort!(orfs; kwargs...) +end \ No newline at end of file diff --git a/src/criteria/lordr.jl b/src/criteria/lordr.jl new file mode 100644 index 0000000..f2d446e --- /dev/null +++ b/src/criteria/lordr.jl @@ -0,0 +1,65 @@ +export log_odds_ratio_decision_rule, lordr, lors + +@doc raw""" + log_odds_ratio_decision_rule( + sequence::LongSequence{DNAAlphabet{4}}; + codingmodel::BioMarkovChain, + noncodingmodel::BioMarkovChain, + η::Float64 = 1e-5 + ) + +Check if a given DNA sequence is likely to be coding based on a log-odds ratio. + The log-odds ratio is a statistical measure used to assess the likelihood of a sequence being coding or non-coding. It compares the probability of the sequence generated by a coding model to the probability of the sequence generated by a non-coding model. If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. + It is formally described as a decision rule: + +```math +S(X) = \log \left( \frac{{P_C(X_1=i_1, \ldots, X_T=i_T)}}{{P_N(X_1=i_1, \ldots, X_T=i_T)}} \right) \begin{cases} > \eta & \Rightarrow \text{{coding}} \\ < \eta & \Rightarrow \text{{noncoding}} \end{cases} +``` + +# Arguments +- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The DNA sequence to be evaluated. + +## Keyword Arguments +- `codingmodel::BioMarkovChain`: The transition model for coding regions, (default: `ECOLICDS`). +- `noncodingmodel::BioMarkovChain`: The transition model for non-coding regions, (default: `ECOLINOCDS`) +- `η::Float64 = 1e-5`: The threshold value (eta) for the log-odds ratio (default: 1e-5). + +# Returns +- `true` if the sequence is likely to be coding. +- `false` if the sequence is likely to be non-coding. + +# Raises +- `ErrorException`: if the length of the sequence is not divisible by 3. +- `ErrorException`: if the sequence contains a premature stop codon. + +# Example + +``` +sequence = dna"ATGGCATCTAG" +iscoding(sequence) # Returns: true or false +``` +""" +function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + codingmodel::BioMarkovChain = ECOLICDS, + noncodingmodel::BioMarkovChain = ECOLINOCDS, + η::Float64 = 1e-5 +) where {N} + pcoding = dnaseqprobability(sequence, codingmodel) + pnoncoding = dnaseqprobability(sequence, noncodingmodel) + + logodds = log(pcoding / pnoncoding) + + length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") + + !hasprematurestop(sequence) || error("There is a premature stop codon in the sequence") + + if logodds > η + return true + else + false + end +end + +const log_odds_ratio_decision_rule = lordr # criteria +const lors = log_odds_ratio_score # from BioMarkovChains \ No newline at end of file diff --git a/src/findorfs.jl b/src/findorfs.jl index 76d8f38..9d093a9 100644 --- a/src/findorfs.jl +++ b/src/findorfs.jl @@ -5,17 +5,15 @@ export findorfs This is the main interface method for finding open reading frames (ORFs) in a DNA sequence. -It takes the following arguments: +It takes the following required arguments: - `sequence`: The nucleic acid sequence to search for ORFs. - `method`: The algorithm used to find ORFs. It can be either `NaiveFinder()`, `NaiveFinderScored()` or yet other implementations. ## Keyword Arguments regardless of the finder method: -- `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. -- `min_len`: The minimum length of an ORF. Default is `6`. - -## Keyword Arguments for `NaiveFinderScored()`: -- `scoringscheme::BioMarkovChain`: The scoring scheme to use for the scoring algorithm. Default is `ECOLICDS`. +- `alternative_start::Bool`: A boolean indicating whether to consider alternative start codons. Default is `false`. +- `minlen::Int`: The minimum length of an ORF. Default is `6`. +- `scheme::Function`: The scoring scheme to use for scoring the sequence from the ORF. Default is `nothing`. ## Returns A vector of `ORF` objects representing the found ORFs. @@ -31,30 +29,21 @@ sequence = randdnaseq(120) findorfs(sequence, NaiveFinder()) 1-element Vector{ORF}: - ORF(77:118, '-', 2, 0.0) + ORF{NaiveFinder}(77:118, '-', 2, 0.0) ``` """ function findorfs end function findorfs( - sequence::NucleicSeqOrView{DNAAlphabet{N}}, - ::NaiveFinder; - kwargs... -) where {N} - return naivefinder(sequence; kwargs...)::Vector{ORF} -end - -function findorfs( - sequence::NucleicSeqOrView{DNAAlphabet{N}}, - ::NaiveFinderScored; # Should we decoupled the FinderMethod from the ScoringMethod? if so, what strategy to use for add the parameter (as a kwargs, as a ::SpecificScoringMethod or as a ::Type{M} where M<:ScoringMethod)? + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + finder::Type{F}=NaiveFinder, kwargs... -) where {N} - return naivefinderscored(sequence; kwargs...)::Vector{ORF} #alternative_start, min_len, scoringscheme +) where {N, F<:GeneFinderMethod} + return finder(sequence; kwargs...)::Vector{ORF{N,F}} end - ### Some possible ideas: # function findorfs( @@ -62,7 +51,7 @@ end # method::Type{M}; # kwargs... # ) where {N, M<:GeneFinderMethod} -# return method(sequence; kwargs...)::Vector{ORF} #alternative_start, min_len, scoringscheme +# return method(sequence; kwargs...)::Vector{ORF} #alternative_start, minlen, scoringscheme # end # This general implementation would allow for the addition of new finder methods without changing the findorfs method. @@ -82,7 +71,7 @@ end # function NaiveFinderScored( # sequence::NucleicSeqOrView{DNAAlphabet{N}}; # alternative_start::Bool = false, -# min_len::Int64 = 6, +# minlen::Int64 = 6, # scoringscheme::BioMarkovChain = ECOLICDS, # byf::Symbol = :location, # kwargs... @@ -94,7 +83,7 @@ end # seq = strand == '-' ? reverse_complement(sequence) : sequence # @inbounds for location in @views _locationiterator(seq; alternative_start) -# if length(location) >= min_len +# if length(location) >= minlen # frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] # start = strand == '+' ? location.start : seqlen - location.stop + 1 # stop = start + length(location) - 1 diff --git a/src/getorfs.jl b/src/getorfs.jl index b8fefa5..c487aeb 100644 --- a/src/getorfs.jl +++ b/src/getorfs.jl @@ -1,142 +1,131 @@ -export getorfs - -#### get_orfs_* methods #### - -""" - getorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}, ::Alphabet, method::M; kwargs...) where {N, M<:GeneFinderMethod} - -This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function under the selected finder method. The function then extracts the DNA or aminoacid sequence of each ORF and stores it in a `Vector{LongSubSeq{A}}`. - -# Arguments - -- `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` -- `alphabet`: The output sequence alphabet. It can be either `DNAAlphabet{4}()` or `AminoAcidAlphabet()`. -- `method`: The method used to find ORFs. It must be a subtype of `GeneFinderMethod`. (e.g., `NaiveFinder()`) - -# Keyword Arguments - -- `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. -- `min_len::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). - -# Examples - -```julia -sequence = randdnaseq(120) - - 120nt DNA Sequence: - GCCGGACAGCGAAGGCTAATAAATGCCCGTGCCAGTATC…TCTGAGTTACTGTACACCCGAAAGACGTTGTACGCATTT - - -getorfs(sequence, DNAAlphabet{4}(), NaiveFinder()) - - 1-element Vector{LongSubSeq{DNAAlphabet{4}}}: - ATGCGTACAACGTCTTTCGGGTGTACAGTAACTCAGAGCTAA - -``` -""" -function getorfs( - sequence::NucleicSeqOrView{DNAAlphabet{N}}, - ::DNAAlphabet{N}, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6, - kwargs... -) where {N,M<:GeneFinderMethod} - orfs = findorfs(sequence, method; alternative_start, min_len) - seqs = Vector{LongSubSeq{DNAAlphabet{N}}}(undef, length(orfs)) #Vector{}(undef, length(orfs)) # todo correct the output type - - @inbounds for (i, orf) in enumerate(orfs) - seqs[i] = sequence[orf] - end - - return seqs -end - -function getorfs( - sequence::NucleicSeqOrView{DNAAlphabet{N}}, - ::AminoAcidAlphabet, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6, - code::GeneticCode = ncbi_trans_table[1] -) where {N,M<:GeneFinderMethod} - orfs = findorfs(sequence, method; alternative_start, min_len) - seqs = Vector{LongSubSeq{AminoAcidAlphabet}}(undef, length(orfs)) - - @inbounds for (i, orf) in enumerate(orfs) - seqs[i] = translate(sequence[orf]; code) - end - - return seqs -end - -#### record_orfs_* methods #### - -""" - record_orfs_fna(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -Record Open Reading Frames (ORFs) in a nucleic acid sequence. - -# Arguments -- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The nucleic acid sequence to search for ORFs. -- `alternative_start::Bool=false`: If `true`, consider alternative start codons for ORFs. -- `min_len::Int=6`: The minimum length of an ORF to be recorded. - -# Returns -An array of `FASTARecord` objects representing the identified ORFs. - -# Description -This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument. -""" -# function record_orfs_fna( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# alternative_start::Bool = false, -# min_len::Int64 = 6 -# ) where {N} -# orfs = findorfs(sequence; alternative_start, min_len) -# norfs = length(orfs) -# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) -# records = FASTARecord[] -# @inbounds for (index, orf) in enumerate(orfs) -# id = string(lpad(string(index), padding, "0")) -# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" -# record = FASTARecord(header, sequence[orf]) -# push!(records, record) -# end -# return records +# export getorfs + +# #### get_orfs_* methods #### + +# """ +# getorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}, ::Alphabet, method::M; kwargs...) where {N, M<:GeneFinderMethod} + +# This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function under the selected finder method. The function then extracts the DNA or aminoacid sequence of each ORF and stores it in a `Vector{LongSubSeq{A}}`. + +# # Arguments + +# - `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` +# - `alphabet`: The output sequence alphabet. It can be either `DNAAlphabet{4}()` or `AminoAcidAlphabet()`. +# - `method`: The method used to find ORFs. It must be a subtype of `GeneFinderMethod`. (e.g., `NaiveFinder()`) + +# # Keyword Arguments + +# - `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. +# - `minlen::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). + +# # Examples + +# ```julia +# sequence = randdnaseq(120) + +# 120nt DNA Sequence: +# GCCGGACAGCGAAGGCTAATAAATGCCCGTGCCAGTATC…TCTGAGTTACTGTACACCCGAAAGACGTTGTACGCATTT + + +# getorfs(sequence, DNAAlphabet{4}(), NaiveFinder()) + +# 1-element Vector{LongSubSeq{DNAAlphabet{4}}}: +# ATGCGTACAACGTCTTTCGGGTGTACAGTAACTCAGAGCTAA + +# ``` +# """ +# function getorfs( +# seq::NucleicSeqOrView{DNAAlphabet{N}}, +# ::DNAAlphabet{N}; +# finder::Type{F}=NaiveFinder, +# alternative_start::Bool = false, +# minlen::Int64 = 6, +# kwargs... +# ) where {N,F<:GeneFinderMethod} +# orfs = findorfs(seq; finder, alternative_start, minlen) +# return sequence.(orfs) # end -""" - record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -This function takes a nucleic acid sequence and finds all open reading frames (ORFs) in the sequence. -An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon. -The function returns a list of FASTA records, where each record represents an ORF in the sequence. - -# Arguments -- `sequence`: The nucleic acid sequence to search for ORFs. -- `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. -- `code`: The genetic code to use for translation. Default is the NCBI translation table 1. -- `min_len`: The minimum length of an ORF. ORFs shorter than this length will be ignored. Default is 6. - -# Returns -- A list of FASTA records representing the ORFs found in the sequence. -""" -# function record_orfs_faa( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# alternative_start::Bool = false, +# function getorfs( +# seq::NucleicSeqOrView{DNAAlphabet{N}}, +# ::AminoAcidAlphabet; +# finder::Type{F}=NaiveFinder, +# alternative_start::Bool = false, +# minlen::Int64 = 6, # code::GeneticCode = ncbi_trans_table[1], -# min_len::Int64 = 6 -# ) where {N} -# orfs = findorfs(sequence; alternative_start, min_len) -# norfs = length(orfs) -# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) -# records = FASTARecord[] -# @inbounds for (index, orf) in enumerate(orfs) -# id = string(lpad(string(index), padding, "0")) -# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" -# record = FASTARecord(header, translate(sequence[orf]; code)) -# push!(records, record) -# end -# return records +# kwargs... +# ) where {N,F<:GeneFinderMethod} +# orfs = findorfs(seq; finder, alternative_start, minlen) +# return @. translate(sequence(orfs); code) # end + +# #### record_orfs_* methods #### + +# """ +# record_orfs_fna(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} + +# Record Open Reading Frames (ORFs) in a nucleic acid sequence. + +# # Arguments +# - `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The nucleic acid sequence to search for ORFs. +# - `alternative_start::Bool=false`: If `true`, consider alternative start codons for ORFs. +# - `minlen::Int=6`: The minimum length of an ORF to be recorded. + +# # Returns +# An array of `FASTARecord` objects representing the identified ORFs. + +# # Description +# This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `minlen` argument. +# """ +# # function record_orfs_fna( +# # sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# # alternative_start::Bool = false, +# # minlen::Int64 = 6 +# # ) where {N} +# # orfs = findorfs(sequence; alternative_start, minlen) +# # norfs = length(orfs) +# # padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# # records = FASTARecord[] +# # @inbounds for (index, orf) in enumerate(orfs) +# # id = string(lpad(string(index), padding, "0")) +# # header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" +# # record = FASTARecord(header, sequence[orf]) +# # push!(records, record) +# # end +# # return records +# # end + +# """ +# record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} + +# This function takes a nucleic acid sequence and finds all open reading frames (ORFs) in the sequence. +# An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon. +# The function returns a list of FASTA records, where each record represents an ORF in the sequence. + +# # Arguments +# - `sequence`: The nucleic acid sequence to search for ORFs. +# - `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. +# - `code`: The genetic code to use for translation. Default is the NCBI translation table 1. +# - `minlen`: The minimum length of an ORF. ORFs shorter than this length will be ignored. Default is 6. + +# # Returns +# - A list of FASTA records representing the ORFs found in the sequence. +# """ +# # function record_orfs_faa( +# # sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# # alternative_start::Bool = false, +# # code::GeneticCode = ncbi_trans_table[1], +# # minlen::Int64 = 6 +# # ) where {N} +# # orfs = findorfs(sequence; alternative_start, minlen) +# # norfs = length(orfs) +# # padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# # records = FASTARecord[] +# # @inbounds for (index, orf) in enumerate(orfs) +# # id = string(lpad(string(index), padding, "0")) +# # header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" +# # record = FASTARecord(header, translate(sequence[orf]; code)) +# # push!(records, record) +# # end +# # return records +# # end diff --git a/src/io.jl b/src/io.jl index 9e0a88f..8924eaf 100644 --- a/src/io.jl +++ b/src/io.jl @@ -15,16 +15,17 @@ Write BED data to a file. # Keywords - `alternative_start::Bool=false`: If true, alternative start codons will be used when identifying CDSs. Default is `false`. -- `min_len::Int64=6`: The minimum length that a CDS must have in order to be included in the output file. Default is `6`. +- `minlen::Int64=6`: The minimum length that a CDS must have in order to be included in the output file. Default is `6`. """ function write_orfs_bed( input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, - output::Union{IOStream, IOBuffer}, - method::M; + output::Union{IOStream, IOBuffer}; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N , M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) + minlen::Int64 = 6 +) where {N,F<:GeneFinderMethod} + sequence = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(sequence; finder, alternative_start, minlen) @inbounds for orf in orfs println(output, orf.first, "\t", orf.last, "\t", orf.strand, "\t", orf.frame) end @@ -32,12 +33,13 @@ end function write_orfs_bed( input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, - output::String, - method::M; + output::String; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) + minlen::Int64 = 6 +) where {N, F<:GeneFinderMethod} + sequence = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(sequence; finder, alternative_start, minlen) open(output, "w") do f @inbounds for orf in orfs write(f, "$(orf.first)\t$(orf.last)\t$(orf.strand)\t$(orf.frame)\n") @@ -45,36 +47,6 @@ function write_orfs_bed( end end -# function write_orfs_bed( -# input::String, -# output::Union{IOStream, IOBuffer}, -# method::M; -# alternative_start::Bool = false, -# min_len::Int64 = 6 -# ) where {M<:GeneFinderMethod} -# input = fasta_to_dna(input)[1] # rewrite input to be a DNA sequence -# orfs = findorfs(input, method; alternative_start, min_len) -# @inbounds for orf in orfs -# println(output, orf.first, "\t", orf.last, "\t", orf.strand, "\t", orf.frame) -# end -# end - -# function write_orfs_bed( -# input::String, -# output::String, -# method::M; -# alternative_start::Bool = false, -# min_len::Int64 = 6 -# ) where {M<:GeneFinderMethod} -# input = fasta_to_dna(input)[1] -# orfs = findorfs(input, method; alternative_start, min_len) -# open(output, "w") do f -# @inbounds for orf in orfs -# write(f, "$(orf.first)\t$(orf.last)\t$(orf.strand)\t$(orf.frame)\n") -# end -# end -# end - """ write_orfs_fna(input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}, method::M; kwargs...) where {N, M<:GeneFinderMethod} write_orfs_fna(input::NucleicSeqOrView{DNAAlphabet{N}}, output::String, method::M; kwargs...) where {N, M<:GeneFinderMethod} @@ -92,7 +64,7 @@ Write a file containing the coding sequences (CDSs) of a given DNA sequence to t # Keywords - `alternative_start::Bool=false`: If true, alternative start codons will be used when identifying CDSs. Default is `false`. -- `min_len::Int64=6`: The minimum length that a CDS must have in order to be included in the output file. Default is `6`. +- `minlen::Int64=6`: The minimum length that a CDS must have in order to be included in the output file. Default is `6`. # Examples @@ -108,76 +80,41 @@ end """ function write_orfs_fna( input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, - output::Union{IOStream, IOBuffer}, - method::M; + output::Union{IOStream, IOBuffer}; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) + minlen::Int64 = 6 +) where {N, F<:GeneFinderMethod} + seq = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) println(output, ">", orf.groupname, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame, " score=", orf.score) - println(output, input[orf]) + println(output, sequence(orf)) end end function write_orfs_fna( input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, - output::String, - method::M; + output::String; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) + minlen::Int64 = 6 +) where {N, F<:GeneFinderMethod} + seq = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) open(output, "w") do f for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - write(f, ">$(orf.groupname) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame) score=$(orf.score)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) + write(f, ">$(orf.groupname) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame) score=$(orf.score)\n$(sequence(orf))\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) end end end -# function write_orfs_fna( -# input::String, -# output::Union{IOStream, IOBuffer}, -# method::M; -# alternative_start::Bool = false, -# min_len::Int64 = 6 -# ) where {M<:GeneFinderMethod} -# input = fasta_to_dna(input)[1] -# orfs = findorfs(input, method; alternative_start, min_len) -# norfs = length(orfs) -# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) -# @inbounds for (i, orf) in enumerate(orfs) -# id = string(lpad(string(i), padding, "0")) -# println(output, ">ORF", id, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame) -# println(output, input[orf]) -# end -# end - -# function write_orfs_fna( -# input::String, -# output::String, -# method::M; -# alternative_start::Bool = false, -# min_len::Int64 = 6 -# ) where {M<:GeneFinderMethod} -# input = fasta_to_dna(input)[1] -# orfs = findorfs(input, method; alternative_start, min_len) -# norfs = length(orfs) -# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) -# open(output, "w") do f -# for (i, orf) in enumerate(orfs) -# id = string(lpad(string(i), padding, "0")) -# write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) -# end -# end -# end - """ write_orfs_faa(input::NucleicSeqOrView{DNAAlphabet{4}}, output::Union{IOStream, IOBuffer}, method::M; kwargs...) where {N, M<:GeneFinderMethod} write_orfs_faa(input::NucleicSeqOrView{DNAAlphabet{4}}, output::String, method::M; kwargs...) where {N, M<:GeneFinderMethod} @@ -195,7 +132,7 @@ Write the protein sequences encoded by the coding sequences (CDSs) of a given DN - `code::GeneticCode=BioSequences.standard_genetic_code`: The genetic code by which codons will be translated. See `BioSequences.ncbi_trans_table` for more info. - `alternative_start::Bool=false`: If true will pass the extended start codons to search. This will increase 3x the exec. time. -- `min_len::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. +- `minlen::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. # Examples @@ -210,77 +147,40 @@ end ``` """ function write_orfs_faa( - input::NucleicSeqOrView{DNAAlphabet{N}}, - output::Union{IOStream, IOBuffer}, - method::M; + input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, + output::Union{IOStream, IOBuffer}; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6, + minlen::Int64 = 6, code::GeneticCode = ncbi_trans_table[1] -) where {N, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) +) where {N,F<:GeneFinderMethod} + seq = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) println(output, ">ORF", id, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame) - println(output, translate(input[orf]; code)) - end -end - -function write_orfs_faa( - input::NucleicSeqOrView{DNAAlphabet{N}}, - output::String, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6, - code::GeneticCode = ncbi_trans_table[1] -) where {N, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - open(output, "w") do f - for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") - end + println(output, translate(orf; code)) end end function write_orfs_faa( - input::String, - output::Union{IOStream, IOBuffer}, - method::M; + input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, + output::String; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6, + minlen::Int64 = 6, code::GeneticCode = ncbi_trans_table[1] -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - @inbounds for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.first, " stop=", orf.last, " strand=", orf.strand, " frame=", orf.frame) - println(output, translate(input[orf]; code)) - end -end - -function write_orfs_faa( - input::String, - output::String, - method::M; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6, -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) +) where {N,F<:GeneFinderMethod} + seq = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) open(output, "w") do f for (i, orf) in enumerate(orfs) id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") + write(f, ">ORF$(id) id=$(id) start=$(orf.first) stop=$(orf.last) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(orf; code))\n") end end end @@ -302,34 +202,36 @@ Write GFF data to a file. - `code::GeneticCode=BioSequences.standard_genetic_code`: The genetic code by which codons will be translated. See `BioSequences.ncbi_trans_table` for more info. - `alternative_start::Bool=false`: If true will pass the extended start codons to search. This will increase 3x the exec. time. -- `min_len::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. +- `minlen::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. """ function write_orfs_gff( - input::NucleicSeqOrView{A}, - output::Union{IOStream, IOBuffer}, - method::M; + input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, + output::Union{IOStream, IOBuffer}; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6, -) where {A, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) + minlen::Int64 = 6, +) where {N, F<:GeneFinderMethod} + seq = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") for (i, orf) in enumerate(orfs) id = string("ORF", lpad(string(i), padding, "0")) println(output, "Chr\t.\tORF\t", orf.first, "\t", orf.last, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) - end + end end function write_orfs_gff( - input::NucleicSeqOrView{A}, - output::String, - method::M; + input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, + output::String; + finder::Type{F}=NaiveFinder, alternative_start::Bool = false, - min_len::Int64 = 6, -) where {A, M<:GeneFinderMethod} - orfs = findorfs(input, method; alternative_start, min_len) + minlen::Int64 = 6, +) where {N, F<:GeneFinderMethod} + seq = typeof(input) === String ? fasta2bioseq(input)[1] : input + orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) open(output, "w") do f @@ -344,45 +246,5 @@ function write_orfs_gff( end end -function write_orfs_gff( - input::String, - output::Union{IOStream, IOBuffer}, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6, -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") - for (i, orf) in enumerate(orfs) - id = string("ORF", lpad(string(i), padding, "0")) - println(output, "Chr\t.\tORF\t", orf.first, "\t", orf.last, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) - end -end - -function write_orfs_gff( - input::String, - output::String, - method::M; - alternative_start::Bool = false, - min_len::Int64 = 6, -) where {M<:GeneFinderMethod} - input = fasta_to_dna(input)[1] - orfs = findorfs(input, method; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - open(output, "w") do f - write(f, "##gff-version 3\n##sequence-region Chr 1 $(length(input))\n") - for (i, orf) in enumerate(orfs) - id = string("ORF", lpad(string(i), padding, "0")) - write( - f, - "Chr\t.\tORF\t$(orf.first)\t$(orf.last)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", - ) - end - end -end # Chr needs to be changed for fasta (?) or the name of seq (?) to fit well on IGV \ No newline at end of file diff --git a/src/iscoding.jl b/src/iscoding.jl index a0d31c8..982efbc 100644 --- a/src/iscoding.jl +++ b/src/iscoding.jl @@ -29,10 +29,10 @@ end function iscoding( sequence::NucleicSeqOrView{DNAAlphabet{N}}, - orf::ORF{M}; + orf::ORF{N,F}; criteria::Function = lordr, kwargs... -) where {N,M <: GeneFinderMethod} +) where {N,F<:GeneFinderMethod} return criteria(sequence[orf]; kwargs...) end diff --git a/src/utils.jl b/src/utils.jl index 404bf9f..27bf9f3 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,4 @@ -export hasprematurestop, fasta_to_dna, get_variable_name +export hasprematurestop, fasta2bioseq, get_var_name # General purposes methods supporting main functions """ @@ -27,17 +27,17 @@ function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool wher end """ - fasta_to_dna(input::String) + fasta2bioseq(input::String) Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. """ -function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} +function fasta2bioseq(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} FASTAReader(open(input)) do reader return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] end end -function get_variable_name(var) +function get_var_name(var::NucleicSeqOrView{DNAAlphabet{N}}) where {N} for name in names(Main) if getfield(Main, name) === var return string(name) diff --git a/test/findorfstest.jl b/test/findorfstest.jl index 2eb9081..3596075 100644 --- a/test/findorfstest.jl +++ b/test/findorfstest.jl @@ -21,7 +21,7 @@ # # 2) A common start codon sequence at 426:590 # # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop # seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" -# orfs03 = findorfs(seq03, NaiveFinder(), min_len=75) +# orfs03 = findorfs(seq03, NaiveFinder(), minlen=75) # @test length(orfs03) == 9 # @test orfs03 == [ORF{NaiveFinder}(37:156, '+', 1, nothing), ORF{NaiveFinder}(194:268, '-', 2, nothing), ORF{NaiveFinder}(194:283, '-', 2, nothing), ORF{NaiveFinder}(249:347, '+', 3, nothing), ORF{NaiveFinder}(426:590, '+', 3, nothing), ORF{NaiveFinder}(565:657, '+', 1, nothing), ORF{NaiveFinder}(650:727, '-', 2, nothing), ORF{NaiveFinder}(786:872, '+', 3, nothing), ORF{NaiveFinder}(887:976, '-', 2, nothing)] # #|-> This occured in Pyrodigal @@ -33,10 +33,11 @@ # # findorfs (GeneFinder.jl) --> 885 # # NCBI ORFfinder --> 375 ORFs # # orfipy --> 375 (`orfipy NC_001416.1.fasta --start ATG --include-stop --min 75`) -# NC_001416 = fasta_to_dna("data/NC_001416.1.fasta")[1] -# NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), min_len=75) +# NC_001416 = fasta2bioseq("data/NC_001416.1.fasta")[1] +# NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), minlen=75) # @test length(NC_001416_orfs) == 885 # end + # @testitem "getorfs dna" default_imports=false begin # using BioSequences: @dna_str, DNAAlphabet diff --git a/test/getindextest.jl b/test/getindextest.jl index 3ab5f71..cf57df8 100644 --- a/test/getindextest.jl +++ b/test/getindextest.jl @@ -1,16 +1,16 @@ # Import the function to be tested # Run the tests -@testitem "getindex" default_imports=false begin +# @testitem "getindex" default_imports=false begin - using BioSequences: @dna_str - using GeneFinder: ORF, NaiveFinder - using Test: @test +# using BioSequences: @dna_str +# using GeneFinder: ORF, NaiveFinder +# using Test: @test - seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" +# seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - orfs01 = [ORF("seq01", 1:33, '+', 1, NaiveFinder()), ORF("seq01", 4:33, '+', 1, NaiveFinder()), ORF("seq01", 8:22, '+', 2, NaiveFinder()), ORF("seq01", 12:29, '+', 3, NaiveFinder()), ORF("seq01", 16:33, '+', 1, NaiveFinder())] +# orfs01 = [ORF("seq01", 1:33, '+', 1, NaiveFinder()), ORF("seq01", 4:33, '+', 1, NaiveFinder()), ORF("seq01", 8:22, '+', 2, NaiveFinder()), ORF("seq01", 12:29, '+', 3, NaiveFinder()), ORF("seq01", 16:33, '+', 1, NaiveFinder())] - @test getindex(seq01, orfs01[1]) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" - @test getindex(seq01, orfs01[2]) == dna"ATGCATGCATGCATGCTAGTAACTAGCTAG" -end \ No newline at end of file +# @test getindex(seq01, orfs01[1]) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" +# @test getindex(seq01, orfs01[2]) == dna"ATGCATGCATGCATGCTAGTAACTAGCTAG" +# end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 34f222c..2900ce2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,7 @@ module GeneFinderTests using Test using BioSequences using FASTX -# using GeneFinder +using GeneFinder using Aqua using TestItems using TestItemRunner @@ -12,7 +12,7 @@ using TestItemRunner # include("findorfstest.jl") # include("iotest.jl") -include("getindextest.jl") +# include("getindextest.jl") # include("aquatest.jl") end From 43365420f04286f671dda16ce7b5ff3740bad4b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 15:01:49 -0500 Subject: [PATCH 12/40] Remove unused function --- src/findorfs.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/findorfs.jl b/src/findorfs.jl index 9d093a9..84d9819 100644 --- a/src/findorfs.jl +++ b/src/findorfs.jl @@ -33,8 +33,6 @@ findorfs(sequence, NaiveFinder()) ``` """ -function findorfs end - function findorfs( sequence::NucleicSeqOrView{DNAAlphabet{N}}; finder::Type{F}=NaiveFinder, From 051650863717140a853d7ba90882d5c47abb2b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 3 Jun 2024 15:02:09 -0500 Subject: [PATCH 13/40] Update deps --- src/GeneFinder.jl | 2 +- src/algorithms/naivecollector.jl | 14 ++-- src/algorithms/naivefinder.jl | 139 +++++++++++++++++-------------- src/types.jl | 8 +- src/utils.jl | 1 + 5 files changed, 87 insertions(+), 77 deletions(-) diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 94df7d6..cbc4834 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -27,7 +27,7 @@ using BioMarkovChains: BioMarkovChain, dnaseqprobability, ECOLICDS, ECOLINOCDS, using FASTX: FASTAReader, sequence using IterTools: takewhile, iterated using PrecompileTools: @setup_workload, @compile_workload -using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, strand, metadata, STRAND_POS, STRAND_NEG, STRAND_BOTH, STRAND_NA +using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, strand, metadata, STRAND_POS, STRAND_NEG, STRAND_BOTH, STRAND_NA, leftposition, rightposition, length # Finder Algorithms include("algorithms/naivefinder.jl") diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index 4efb608..f733e67 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -18,7 +18,7 @@ function NaiveCollector( seqlen = length(sequence) seqname = get_var_name(sequence) - function createorfs(x, strand, s) + function createorfs(x, strand) if length(x.captured[1]:x.captured[3]) < minlen return nothing end @@ -31,17 +31,17 @@ function NaiveCollector( stop = seqlen - x.captured[1] + 1 frame = framedict[(seqlen - x.captured[3]) % 3] end - seq = @view(sequence[start:stop]) - score = scheme === nothing ? 0 : scheme(@view(s[start:stop]), ECOLICDS) - fts = Dict(:gc => gc_content(seq), :length => length(seq), :score => score) + seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) + score = scheme === nothing ? 0.0 : scheme(@view(seq[start:stop]), ECOLICDS) + fts = Dict(:score => score) return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, seq, fts, scheme) end orfs = Vector{ORF{N,NaiveCollector}}() for strand in ('+', '-') - s = strand == '-' ? revseq : sequence - matches = eachmatch(regorf, s, overlap) - strandedorfs = filter(!isnothing, [createorfs(x, strand, s) for x in matches]) + seq = strand == '-' ? revseq : sequence + matches = eachmatch(regorf, seq, overlap) + strandedorfs = filter(!isnothing, [createorfs(x, strand) for x in matches]) append!(orfs, strandedorfs) end diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 2c7cbed..d78c20e 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -62,82 +62,95 @@ The `naivefinder` function takes a LongSequence{DNAAlphabet{4}} sequence and ret If the log-odds ratio exceeds a given threshold (`η`), the sequence is considered likely to be coding. See [`lordr`](@ref) for more information about coding creteria. """ -# function NaiveFinder( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# alternative_start::Bool = false, -# minlen::Int64 = 6, -# scheme::Union{Nothing, Function} = nothing, -# kwargs... -# ) where {N} -# seqlen = length(sequence) -# framedict = Dict(0 => 3, 1 => 1, 2 => 2) -# orfs = Vector{ORF{N,NaiveFinder}}() -# seqname = get_var_name(sequence) -# for strand in ('+', '-') -# s = strand == '-' ? reverse_complement(sequence) : sequence -# @inbounds for location in @views _locationiterator(s; alternative_start) -# if length(location) >= minlen -# start = strand == '+' ? location.start : seqlen - location.stop + 1 -# stop = start + length(location) - 1 -# frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] -# seq = @view(sequence[start:stop]) -# score = scheme === nothing ? 0 : scheme(@view(s[start:stop]), ECOLICDS) -# gc = gc_content(seq) -# len = length(seq) -# fts = Dict(:gc => gc, :length => len, :score => score) -# push!(orfs, ORF{N,NaiveFinder}(seqname, start, stop, strand, frame, NaiveFinder, seq, fts, scheme)) -# end -# end -# end -# return sort!(orfs; kwargs...) -# end - function NaiveFinder( sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool = false, minlen::Int64 = 6, - scheme::Union{Nothing, Function} = nothing, + scheme::Union{Nothing,Function} = nothing, kwargs... ) where {N} - seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) orfs = Vector{ORF{N,NaiveFinder}}() seqname = get_var_name(sequence) - - function createorf(location, strand, s, seqlen) - if length(location) < minlen - return nothing - end - - if strand == '+' - start = location.start - stop = start + length(location) - 1 - frame = framedict[location.start % 3] - else - start = seqlen - location.stop + 1 - stop = start + length(location) - 1 - frame = framedict[(seqlen - location.stop + 1) % 3] - end - - seq = @view(sequence[start:stop]) - score = scheme === nothing ? 0 : scheme(@view(s[start:stop]), ECOLICDS) - gc = gc_content(seq) - len = length(seq) - fts = Dict(:gc => gc, :length => len, :score => score) - - return ORF{N, NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme) - end - - @inbounds for strand in ('+', '-') + for strand in ('+', '-') s = strand == '-' ? reverse_complement(sequence) : sequence @inbounds for location in @views _locationiterator(s; alternative_start) - orf = createorf(location, strand, s, seqlen) - if orf !== nothing - push!(orfs, orf) + if length(location) >= minlen + #main fields + start = strand == '+' ? location.start : seqlen - location.stop + 1 + stop = start + length(location) - 1 + frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] + seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) + #features + score = scheme === nothing ? 0.0 : scheme(@view(seq[start:stop]), ECOLICDS) + + #populate the feature dict + fts = Dict(:score => score) #:gc => gc, :length => len, + + push!(orfs, ORF{N,NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme)) end end end - return sort!(orfs; kwargs...) -end \ No newline at end of file +end + +# export createorf + +# function createorf( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}, +# seqname::String, +# location::UnitRange{Int64}, +# strand::Char, +# seqlen::Int64, +# minlen::Int64, +# scheme::Union{Nothing,Function} +# ) where {N} +# if length(location) < minlen +# return nothing +# end + +# framedict = Dict(0 => 3, 1 => 1, 2 => 2) + +# if strand == '+' +# start = location.start +# stop = start + length(location) - 1 +# frame = framedict[location.start % 3] +# else +# start = seqlen - location.stop + 1 +# stop = start + length(location) - 1 +# frame = framedict[(seqlen - location.stop + 1) % 3] +# end + +# seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) +# score = scheme === nothing ? 0 : scheme(@view(seq[start:stop]), ECOLICDS) +# gc = gc_content(seq) +# len = length(seq) +# fts = Dict(:gc => gc, :length => len, :score => score) + +# return ORF{N, NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme) +# end + +# function NaiveFinder( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# minlen::Int64 = 6, +# scheme::Union{Nothing,Function} = nothing, +# kwargs... +# ) where {N} +# seqlen = length(sequence) +# orfs = Vector{ORF{N,NaiveFinder}}() +# seqname = get_var_name(sequence) + +# @inbounds for strand in ('+', '-') +# seq = strand == '-' ? reverse_complement(sequence) : sequence +# @inbounds for location in @views _locationiterator(s; alternative_start) +# orf = createorf(seq, seqname, location, strand, seqlen, minlen, scheme) +# if orf !== nothing +# push!(orfs, orf) +# end +# end +# end + +# return sort!(orfs; kwargs...) +# end \ No newline at end of file diff --git a/src/types.jl b/src/types.jl index 2aa52a4..9e4ec85 100644 --- a/src/types.jl +++ b/src/types.jl @@ -73,19 +73,15 @@ function scheme(i::ORF{N,F}) where {N,F} end function sequence(i::ORF{N,F}) where {N,F} - return i.strand == STRAND_POS ? i.seq : reverse_complement(i.seq) + return i.seq #i.strand == STRAND_POS ? i.seq : reverse_complement(i.seq) end function score(i::ORF{N,F}) where {N,F} return i.features[:score] end -function gc(i::ORF{N,F}) where {N,F} - return i.features[:gc] -end - function length(i::ORF{N,F}) where {N,F} - return i.features[:length] + return length(i.seq) end function features(i::ORF{N,F}) where {N,F} diff --git a/src/utils.jl b/src/utils.jl index 27bf9f3..f40a09f 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,5 @@ export hasprematurestop, fasta2bioseq, get_var_name + # General purposes methods supporting main functions """ From f17a1f5dac629230a3123d69de4c106239c44083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Tue, 4 Jun 2024 21:43:02 -0500 Subject: [PATCH 14/40] Refactor finders with correct score calculation --- src/algorithms/naivecollector.jl | 2 +- src/algorithms/naivefinder.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index f733e67..0377de1 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -32,7 +32,7 @@ function NaiveCollector( frame = framedict[(seqlen - x.captured[3]) % 3] end seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) - score = scheme === nothing ? 0.0 : scheme(@view(seq[start:stop]), ECOLICDS) + score = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) fts = Dict(:score => score) return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, seq, fts, scheme) end diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index d78c20e..8fa92d9 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -83,7 +83,7 @@ function NaiveFinder( frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) #features - score = scheme === nothing ? 0.0 : scheme(@view(seq[start:stop]), ECOLICDS) + score = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) # @view(sequence[start:stop] #populate the feature dict fts = Dict(:score => score) #:gc => gc, :length => len, From 41e509b35f27c4e45092c6ca1e2d955541867b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Wed, 5 Jun 2024 00:07:49 -0500 Subject: [PATCH 15/40] chore: Update Julia version to 1.10.4 --- Manifest.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Manifest.toml b/Manifest.toml index 0cc8efb..49d18d7 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,6 +1,6 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.10.3" +julia_version = "1.10.4" manifest_format = "2.0" project_hash = "514456eec9673906fe896a3ae36685951a71c19f" From 140b672085eb4dfe5a7c14a33105b464772399a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 16:59:32 -0500 Subject: [PATCH 16/40] Update README --- README.md | 56 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 33e7b82..be45347 100644 --- a/README.md +++ b/README.md @@ -50,27 +50,27 @@ seq = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCA Now lest us find the ORFs ```julia -findorfs(seq, NaiveFinder()) - -12-element Vector{ORF}: - ORF(29:40, '+', 2, 0.0) - ORF(137:145, '+', 2, 0.0) - ORF(164:184, '+', 2, 0.0) - ORF(173:184, '+', 2, 0.0) - ORF(236:241, '+', 2, 0.0) - ORF(248:268, '+', 2, 0.0) - ORF(362:373, '+', 2, 0.0) - ORF(470:496, '+', 2, 0.0) - ORF(551:574, '+', 2, 0.0) - ORF(569:574, '+', 2, 0.0) - ORF(581:601, '+', 2, 0.0) - ORF(695:706, '+', 2, 0.0) +orfs = findorfs(seq, finder=NaiveFinder) # use finder=NaiveCollector as an alternative + +12-element Vector{ORF{4, NaiveFinder}}: + ORF{NaiveFinder}(29:40, '+', 2) + ORF{NaiveFinder}(137:145, '+', 2) + ORF{NaiveFinder}(164:184, '+', 2) + ORF{NaiveFinder}(173:184, '+', 2) + ORF{NaiveFinder}(236:241, '+', 2) + ORF{NaiveFinder}(248:268, '+', 2) + ORF{NaiveFinder}(362:373, '+', 2) + ORF{NaiveFinder}(470:496, '+', 2) + ORF{NaiveFinder}(551:574, '+', 2) + ORF{NaiveFinder}(569:574, '+', 2) + ORF{NaiveFinder}(581:601, '+', 2) + ORF{NaiveFinder}(695:706, '+', 2) ``` -Two other methods where implemented into `getorfs` to get the ORFs in DNA or aminoacid sequences, respectively. They use the `findorfs` function to first get the ORFs and then get the correspondance array of `BioSequence` objects. +The `ORF` structure contains also the sequence of the ORF, the frame, and several features. If you want to get the sequence of the ORFs you can broadcast the `sequence` function to safely extract the sequences of all ORFs. ```julia -getorfs(seq, DNAAlphabet{4}(), NaiveFinder()) +sequence.(orfs) 12-element Vector{LongSubSeq{DNAAlphabet{4}}}: ATGCAACCCTGA @@ -87,6 +87,26 @@ getorfs(seq, DNAAlphabet{4}(), NaiveFinder()) ATGCAACCCTGA ``` +Similarly, you can extract the amino acid sequences of the ORFs using the `translate` function. + +```julia +translate.(orfs) + +12-element Vector{LongAA}: + MQP* + MR* + MRRMAR* + MAR* + M* + MCPTAV* + MQP* + MHWLVLSI* + MSPHKAM* + M* + MCPTAA* + MQP* +``` + ## Writting ORF information into bioinformatic formats This package facilitates now the creation of `FASTA`, `BED`, and `GFF` files, specifically extracting Open Reading Frame (ORF) information from `BioSequence` instances, particularly those of type `NucleicSeqOrView{A} where A`, and then writing the information into the desired format. @@ -118,7 +138,7 @@ Once a `BioSequence` object has been instantiated, the `write_orfs_fna` function outfile = "LFLS01000089.fna" open(outfile, "w") do io - write_orfs_fna(seq, io, NaiveFinder()) + write_orfs_fna(seq, io, finder=NaiveFinder) # use finder=NaiveCollector as an alternative end ``` From 8cbd9611ebf1d3aca5cc16c55d8fe7d17708544f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:01:08 -0500 Subject: [PATCH 17/40] Manifest update --- Manifest.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 49d18d7..a00ab9a 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -132,9 +132,9 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" version = "0.7.0" [[deps.StaticArraysCore]] -git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" +git-tree-sha1 = "192954ef1208c7019899fbf8049e717f92959682" uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" -version = "1.4.2" +version = "1.4.3" [[deps.StringViews]] git-tree-sha1 = "f7b06677eae2571c888fd686ba88047d8738b0e3" From 56d3f1c2721135359faebacc46b1476518351b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:02:06 -0500 Subject: [PATCH 18/40] Create Features struct and include it in the ORF as the NamedTuple field --- src/types.jl | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/types.jl b/src/types.jl index 9e4ec85..f9d1755 100644 --- a/src/types.jl +++ b/src/types.jl @@ -1,8 +1,8 @@ -import GenomicFeatures: first, last, length, strand, groupname +import GenomicFeatures: first, last, length, strand, groupname, metadata import FASTX: sequence -export RBS, ORF -export gc, features, sequence +export Features, RBS, ORF +export features, sequence export groupname, finder, frame, scheme, score #### Ribosome Binding Site (RBS) struct #### @@ -10,6 +10,12 @@ export groupname, finder, frame, scheme, score # seq[orf.first-bin01.offset.start:orf.first-1] # motifs = [dna"GGA", dna"GAG", dna"AGG"] + +struct Features{K} + fts::K +end + +Features(fts::NamedTuple) = Features{typeof(fts)}(fts) struct RBS motif::BioRegex{DNA} offset::UnitRange{Int64} # offset @@ -20,8 +26,8 @@ struct RBS end end -# Structs associated with gene models +# Structs associated with gene models # const FEATUREDICT = Dict{Symbol,Any}(:gc => 0.0, :length => 0, :score => 0.0) # const Feature = Union{Real, Vector{RBS}} struct ORF{N,F} <: GenomicFeatures.AbstractGenomicInterval{F} @@ -31,7 +37,7 @@ struct ORF{N,F} <: GenomicFeatures.AbstractGenomicInterval{F} strand::Strand frame::Int seq::LongSubSeq{DNAAlphabet{N}} - features::Dict{Symbol,Any} + features::Features # features::Dict{Symbol,Any} or # features::@NamedTuple{score::Float64, rbs::Any} scheme::Union{Nothing,Function} end @@ -43,10 +49,11 @@ function ORF{N,F}( strand::Union{Strand,Char}, frame::Int, seq::LongSubSeq{DNAAlphabet{N}}, - features::Dict{Symbol,Any}, + features::Features, # ::Dict{Symbol,Any} or # ::@NamedTuple{score::Float64, rbs::Any} or @NamedTuple{Vararg{typeof(...)}} scheme::Union{Nothing,Function}=nothing ) where {N,F<:GeneFinderMethod} # @assert frame in (1, 2, 3) && location[1] < location[end] && score isa Union{Nothing, Float64} && length(seq) == last(location) - first(location) + 1 && length(seq) % 3 == 0 "Invalid input. Please check the frame, location, score, and sequence length." + # groupname == "" && error("Please provide a groupname for the ORF.") return ORF{N,F}(groupname, first, last, strand, frame, seq, features, scheme) #finder end @@ -85,7 +92,7 @@ function length(i::ORF{N,F}) where {N,F} end function features(i::ORF{N,F}) where {N,F} - return i.features + return i.features.fts end function Base.show(io::IO, i::ORF{N,F}) where {N,F} @@ -96,6 +103,10 @@ function Base.show(io::IO, i::ORF{N,F}) where {N,F} end end +function metadata(i::ORF{N,F}) where {N,F} + return features(i) +end + ## Ideas for Gene struct # struct CDS <: AbstractGene From fd839c99402978f2bdc1a5514277dcd5f6f4c415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:02:21 -0500 Subject: [PATCH 19/40] Refactor code to improve precompile file size and loading speed --- src/GeneFinder.jl | 13 +------------ src/workload.jl | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 12 deletions(-) create mode 100644 src/workload.jl diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index cbc4834..467c120 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -26,7 +26,6 @@ using BioSequences: using BioMarkovChains: BioMarkovChain, dnaseqprobability, ECOLICDS, ECOLINOCDS, log_odds_ratio_score using FASTX: FASTAReader, sequence using IterTools: takewhile, iterated -using PrecompileTools: @setup_workload, @compile_workload using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, strand, metadata, STRAND_POS, STRAND_NEG, STRAND_BOTH, STRAND_NA, leftposition, rightposition, length # Finder Algorithms @@ -46,16 +45,6 @@ include("io.jl") include("utils.jl") include("extended.jl") -@setup_workload begin - # Putting some things in `@setup_workload` instead of `@compile_workload` can reduce the size of the - # precompile file and potentially make loading faster. - # using BioSequences, GeneFinder - seq = randdnaseq(99) - @compile_workload begin - # all calls in this block will be precompiled, regardless of whether - # they belong to your package or not (on Julia 1.8 and higher) - # findorfs(seq) - end -end +include("workload.jl") end \ No newline at end of file diff --git a/src/workload.jl b/src/workload.jl new file mode 100644 index 0000000..203879e --- /dev/null +++ b/src/workload.jl @@ -0,0 +1,15 @@ +using PrecompileTools: @setup_workload, @compile_workload + +@setup_workload begin + # Putting some things in `@setup_workload` instead of `@compile_workload` can reduce the size of the + # precompile file and potentially make loading faster. + # using BioSequences, GeneFinder + seq = randdnaseq(99) + @compile_workload begin + _locationiterator(seq; alternative_start=false) + # findorfs(seq) + # all calls in this block will be precompiled, regardless of whether + # they belong to your package or not (on Julia 1.8 and higher) + # findorfs(seq) + end +end \ No newline at end of file From d0cf75477a19909b96266af14aaf0994268171f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:02:42 -0500 Subject: [PATCH 20/40] Update io methods --- src/io.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/io.jl b/src/io.jl index 8924eaf..a67730e 100644 --- a/src/io.jl +++ b/src/io.jl @@ -20,9 +20,9 @@ Write BED data to a file. function write_orfs_bed( input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, output::Union{IOStream, IOBuffer}; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, - minlen::Int64 = 6 + minlen::Int64 = 6, ) where {N,F<:GeneFinderMethod} sequence = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(sequence; finder, alternative_start, minlen) @@ -34,10 +34,10 @@ end function write_orfs_bed( input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, output::String; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, - minlen::Int64 = 6 -) where {N, F<:GeneFinderMethod} + minlen::Int64 = 6, +) where {N,F<:GeneFinderMethod} sequence = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(sequence; finder, alternative_start, minlen) open(output, "w") do f @@ -81,10 +81,10 @@ end function write_orfs_fna( input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, output::Union{IOStream, IOBuffer}; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, - minlen::Int64 = 6 -) where {N, F<:GeneFinderMethod} + minlen::Int64 = 6, +) where {N,F<:GeneFinderMethod} seq = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) @@ -99,10 +99,10 @@ end function write_orfs_fna( input::Union{String,NucleicSeqOrView{DNAAlphabet{N}}}, output::String; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, - minlen::Int64 = 6 -) where {N, F<:GeneFinderMethod} + minlen::Int64 = 6, +) where {N,F<:GeneFinderMethod} seq = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) @@ -149,10 +149,10 @@ end function write_orfs_faa( input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, output::Union{IOStream, IOBuffer}; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, minlen::Int64 = 6, - code::GeneticCode = ncbi_trans_table[1] + code::GeneticCode = ncbi_trans_table[1], ) where {N,F<:GeneFinderMethod} seq = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(seq; finder, alternative_start, minlen) @@ -171,7 +171,7 @@ function write_orfs_faa( finder::Type{F}=NaiveFinder, alternative_start::Bool = false, minlen::Int64 = 6, - code::GeneticCode = ncbi_trans_table[1] + code::GeneticCode = ncbi_trans_table[1], ) where {N,F<:GeneFinderMethod} seq = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(seq; finder, alternative_start, minlen) @@ -208,10 +208,10 @@ Write GFF data to a file. function write_orfs_gff( input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, output::Union{IOStream, IOBuffer}; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, minlen::Int64 = 6, -) where {N, F<:GeneFinderMethod} +) where {N,F<:GeneFinderMethod} seq = typeof(input) === String ? fasta2bioseq(input)[1] : input orfs = findorfs(seq; finder, alternative_start, minlen) norfs = length(orfs) @@ -226,7 +226,7 @@ end function write_orfs_gff( input::Union{String, NucleicSeqOrView{DNAAlphabet{N}}}, output::String; - finder::Type{F}=NaiveFinder, + finder::Type{F} = NaiveFinder, alternative_start::Bool = false, minlen::Int64 = 6, ) where {N, F<:GeneFinderMethod} From 2ac4ce92a1741c445efc7a5210191061caa4a32d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:02:57 -0500 Subject: [PATCH 21/40] Remove getorfs.jl --- src/getorfs.jl | 131 ------------------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 src/getorfs.jl diff --git a/src/getorfs.jl b/src/getorfs.jl deleted file mode 100644 index c487aeb..0000000 --- a/src/getorfs.jl +++ /dev/null @@ -1,131 +0,0 @@ -# export getorfs - -# #### get_orfs_* methods #### - -# """ -# getorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}, ::Alphabet, method::M; kwargs...) where {N, M<:GeneFinderMethod} - -# This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function under the selected finder method. The function then extracts the DNA or aminoacid sequence of each ORF and stores it in a `Vector{LongSubSeq{A}}`. - -# # Arguments - -# - `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` -# - `alphabet`: The output sequence alphabet. It can be either `DNAAlphabet{4}()` or `AminoAcidAlphabet()`. -# - `method`: The method used to find ORFs. It must be a subtype of `GeneFinderMethod`. (e.g., `NaiveFinder()`) - -# # Keyword Arguments - -# - `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. -# - `minlen::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). - -# # Examples - -# ```julia -# sequence = randdnaseq(120) - -# 120nt DNA Sequence: -# GCCGGACAGCGAAGGCTAATAAATGCCCGTGCCAGTATC…TCTGAGTTACTGTACACCCGAAAGACGTTGTACGCATTT - - -# getorfs(sequence, DNAAlphabet{4}(), NaiveFinder()) - -# 1-element Vector{LongSubSeq{DNAAlphabet{4}}}: -# ATGCGTACAACGTCTTTCGGGTGTACAGTAACTCAGAGCTAA - -# ``` -# """ -# function getorfs( -# seq::NucleicSeqOrView{DNAAlphabet{N}}, -# ::DNAAlphabet{N}; -# finder::Type{F}=NaiveFinder, -# alternative_start::Bool = false, -# minlen::Int64 = 6, -# kwargs... -# ) where {N,F<:GeneFinderMethod} -# orfs = findorfs(seq; finder, alternative_start, minlen) -# return sequence.(orfs) -# end - -# function getorfs( -# seq::NucleicSeqOrView{DNAAlphabet{N}}, -# ::AminoAcidAlphabet; -# finder::Type{F}=NaiveFinder, -# alternative_start::Bool = false, -# minlen::Int64 = 6, -# code::GeneticCode = ncbi_trans_table[1], -# kwargs... -# ) where {N,F<:GeneFinderMethod} -# orfs = findorfs(seq; finder, alternative_start, minlen) -# return @. translate(sequence(orfs); code) -# end - -# #### record_orfs_* methods #### - -# """ -# record_orfs_fna(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -# Record Open Reading Frames (ORFs) in a nucleic acid sequence. - -# # Arguments -# - `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The nucleic acid sequence to search for ORFs. -# - `alternative_start::Bool=false`: If `true`, consider alternative start codons for ORFs. -# - `minlen::Int=6`: The minimum length of an ORF to be recorded. - -# # Returns -# An array of `FASTARecord` objects representing the identified ORFs. - -# # Description -# This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `minlen` argument. -# """ -# # function record_orfs_fna( -# # sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# # alternative_start::Bool = false, -# # minlen::Int64 = 6 -# # ) where {N} -# # orfs = findorfs(sequence; alternative_start, minlen) -# # norfs = length(orfs) -# # padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) -# # records = FASTARecord[] -# # @inbounds for (index, orf) in enumerate(orfs) -# # id = string(lpad(string(index), padding, "0")) -# # header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" -# # record = FASTARecord(header, sequence[orf]) -# # push!(records, record) -# # end -# # return records -# # end - -# """ -# record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -# This function takes a nucleic acid sequence and finds all open reading frames (ORFs) in the sequence. -# An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon. -# The function returns a list of FASTA records, where each record represents an ORF in the sequence. - -# # Arguments -# - `sequence`: The nucleic acid sequence to search for ORFs. -# - `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. -# - `code`: The genetic code to use for translation. Default is the NCBI translation table 1. -# - `minlen`: The minimum length of an ORF. ORFs shorter than this length will be ignored. Default is 6. - -# # Returns -# - A list of FASTA records representing the ORFs found in the sequence. -# """ -# # function record_orfs_faa( -# # sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# # alternative_start::Bool = false, -# # code::GeneticCode = ncbi_trans_table[1], -# # minlen::Int64 = 6 -# # ) where {N} -# # orfs = findorfs(sequence; alternative_start, minlen) -# # norfs = length(orfs) -# # padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) -# # records = FASTARecord[] -# # @inbounds for (index, orf) in enumerate(orfs) -# # id = string(lpad(string(index), padding, "0")) -# # header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" -# # record = FASTARecord(header, translate(sequence[orf]; code)) -# # push!(records, record) -# # end -# # return records -# # end From ba9e75ecc159b3c69751721282ad01a6f560efe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:03:27 -0500 Subject: [PATCH 22/40] Update finder methods with the Feature struct --- src/algorithms/naivecollector.jl | 5 +++-- src/algorithms/naivefinder.jl | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index 0377de1..024e705 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -32,8 +32,9 @@ function NaiveCollector( frame = framedict[(seqlen - x.captured[3]) % 3] end seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) - score = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) - fts = Dict(:score => score) + scr = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) + # fts = Dict(:score => score) + fts = Features((score = scr,)) # rbs = RBS(biore"RRR"dna, 3:4, 1.0) return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, seq, fts, scheme) end diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 8fa92d9..9a06a96 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -83,10 +83,11 @@ function NaiveFinder( frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) #features - score = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) # @view(sequence[start:stop] + scr = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) # @view(sequence[start:stop] #populate the feature dict - fts = Dict(:score => score) #:gc => gc, :length => len, + #fts = Dict(:score => score) #:gc => gc, :length => len, + fts = Features((score = scr,)) # rbs = RBS(biore"RRR"dna, 3:4, 1.0) push!(orfs, ORF{N,NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme)) end From 80a05941ca6fcdd40b2c5815eac2f9e9b71daeda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:44:18 -0500 Subject: [PATCH 23/40] Update lors from BMC taking only one argument --- Manifest.toml | 6 +-- Project.toml | 2 +- src/algorithms/naivefinder.jl | 2 +- test/findorfstest.jl | 88 ++++++++++++++++++++--------------- test/runtests.jl | 4 +- 5 files changed, 58 insertions(+), 44 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index a00ab9a..2448361 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.4" manifest_format = "2.0" -project_hash = "514456eec9673906fe896a3ae36685951a71c19f" +project_hash = "e13b3d62f031e8e91ec33e86f0dbf42fa3097ba0" [[deps.Automa]] deps = ["PrecompileTools", "TranscodingStreams"] @@ -21,9 +21,9 @@ version = "0.1.4" [[deps.BioMarkovChains]] deps = ["BioSequences", "PrecompileTools", "VectorizedKmers"] -git-tree-sha1 = "c48cc4efe18355a1a19ae627d05bf19df0a57253" +git-tree-sha1 = "72c9d5027120756a71fb7b1036a052b6f9353ed6" uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c" -version = "0.9.3" +version = "0.10.0" [deps.BioMarkovChains.extensions] DiscreteMarkovChainsExt = "DiscreteMarkovChains" diff --git a/Project.toml b/Project.toml index cae9ace..2fe732b 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,7 @@ IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" [compat] -BioMarkovChains = "0.9" +BioMarkovChains = "0.10" BioSequences = "3" FASTX = "2" GenomicFeatures = "3" diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 9a06a96..833d615 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -83,7 +83,7 @@ function NaiveFinder( frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) #features - scr = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) # @view(sequence[start:stop] + scr = scheme === nothing ? 0.0 : scheme(seq; kwargs...) # @view(sequence[start:stop] #populate the feature dict #fts = Dict(:score => score) #:gc => gc, :length => len, diff --git a/test/findorfstest.jl b/test/findorfstest.jl index 3596075..c083ad7 100644 --- a/test/findorfstest.jl +++ b/test/findorfstest.jl @@ -1,41 +1,55 @@ -# @testset "findorfs" begin -# # cd(@__DIR__) -# # A random seq to start -# seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" -# orfs01 = findorfs(seq01, NaiveFinder()) - -# @test orfs01 == [ORF{NaiveFinder}(1:33, '+', 1, nothing), ORF{NaiveFinder}(4:33, '+', 1, nothing), ORF{NaiveFinder}(8:22, '+', 2, nothing), ORF{NaiveFinder}(12:29, '+', 3, nothing), ORF{NaiveFinder}(16:33, '+', 1, nothing)] -# @test length(orfs01) == 5 - -# # > 180195.SAMN03785337.LFLS01000089 -> finds only 1 gene in Prodigal (from Pyrodigal tests) -# seq02 = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCAATCTGACTGTGGGCGGTGTTACCAACGGCACTGCTACTACTGGCAACATCGCACTGACCGGTAACAATGCGCTGAGCGGTCCGGTCAATCTGAATGCGTCGAATGGCACGGTGACCTTGAACACGACCGGCAATACCACGCTCGGTAACGTGACGGCACAAGGCAATGTGACGACCAATGTGTCCAACGGCAGTCTGACGGTTACCGGCAATACGACAGGTGCCAACACCAACCTCAGTGCCAGCGGCAACCTGACCGTGGGTAACCAGGGCAATATCAGTACCGCAGGCAATGCAACCCTGACGGCCGGCGACAACCTGACGAGCACTGGCAATCTGACTGTGGGCGGCGTCACCAACGGCACGGCCACCACCGGCAACATCGCGCTGACCGGTAACAATGCACTGGCTGGTCCTGTCAATCTGAACGCGCCGAACGGCACCGTGACCCTGAACACAACCGGCAATACCACGCTGGGTAATGTCACCGCACAAGGCAATGTGACGACTAATGTGTCCAACGGCAGCCTGACAGTCGCTGGCAATACCACAGGTGCCAACACCAACCTGAGTGCCAGCGGCAATCTGACCGTGGGCAACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAGC" -# orfs02 = findorfs(seq02, NaiveFinder()) - -# @test length(orfs02) == 12 -# @test orfs02 == [ORF{NaiveFinder}(29:40, '+', 2, nothing), ORF{NaiveFinder}(137:145, '+', 2, nothing), ORF{NaiveFinder}(164:184, '+', 2, nothing), ORF{NaiveFinder}(173:184, '+', 2, nothing), ORF{NaiveFinder}(236:241, '+', 2, nothing), ORF{NaiveFinder}(248:268, '+', 2, nothing), ORF{NaiveFinder}(362:373, '+', 2, nothing), ORF{NaiveFinder}(470:496, '+', 2, nothing), ORF{NaiveFinder}(551:574, '+', 2, nothing), ORF{NaiveFinder}(569:574, '+', 2, nothing), ORF{NaiveFinder}(581:601, '+', 2, nothing), ORF{NaiveFinder}(695:706, '+', 2, nothing)] - -# # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 -# # Pyrodigal predicts 2 genes from this sequence: -# # 1) An alternative start codon (GTG) sequence at 48:347 -# # 2) A common start codon sequence at 426:590 -# # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop -# seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" -# orfs03 = findorfs(seq03, NaiveFinder(), minlen=75) -# @test length(orfs03) == 9 -# @test orfs03 == [ORF{NaiveFinder}(37:156, '+', 1, nothing), ORF{NaiveFinder}(194:268, '-', 2, nothing), ORF{NaiveFinder}(194:283, '-', 2, nothing), ORF{NaiveFinder}(249:347, '+', 3, nothing), ORF{NaiveFinder}(426:590, '+', 3, nothing), ORF{NaiveFinder}(565:657, '+', 1, nothing), ORF{NaiveFinder}(650:727, '-', 2, nothing), ORF{NaiveFinder}(786:872, '+', 3, nothing), ORF{NaiveFinder}(887:976, '-', 2, nothing)] -# #|-> This occured in Pyrodigal -# # Lambda phage tests -# # Compare to https://github.com/jonas-fuchs/viral_orf_finder/blob/master/orf_finder.py -# # Salisbury and Tsorukas (2019) paper used the Lambda phage genome with 73 CDS and 545 non-CDS ORFs (a total of 618) to compare predictions between several Gene Finder programs -# # For a minimal length of 75 nt the following ORFs are predicted: -# # orf_finder.py --> 885 (222 complete) -# # findorfs (GeneFinder.jl) --> 885 -# # NCBI ORFfinder --> 375 ORFs -# # orfipy --> 375 (`orfipy NC_001416.1.fasta --start ATG --include-stop --min 75`) -# NC_001416 = fasta2bioseq("data/NC_001416.1.fasta")[1] -# NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), minlen=75) -# @test length(NC_001416_orfs) == 885 +@testitem "findorfs tests" begin + # cd(@__DIR__) + # using GeneFinder: findorfs, NaiveFinder, ORF + # A random seq to start + using BioSequences: @dna_str + + seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" + # orfs01 = findorfs(seq01, finder=NaiveFinder) + orf01 = ORF{4,NaiveFinder}("seq01", 1, 33, '+', 1, seq01[1:33], Dict(:score => 0.0), nothing) + sequence(orf01) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" + groupname(orf01) == "seq01" + + # @test orfs01 == [ + # ORF{4,NaiveFinder}("seq01", 1, 33, '+', 1, seq01[1:33], Dict(:score => 0.0), nothing), + # ORF{4,NaiveFinder}("seq01", 4, 33, '+', 1, seq01[4:33], Dict(:score => 0.0), nothing), + # ORF{4,NaiveFinder}("seq01", 8, 22, '+', 2, seq01[8:22], Dict(:score => 0.0), nothing), + # ORF{4,NaiveFinder}("seq01", 12, 29, '+', 3, seq01[12:29], Dict(:score => 0.0), nothing), + # ORF{4,NaiveFinder}("seq01", 16, 33, '+', 1, seq01[16:33], Dict(:score => 0.0), nothing) + # ] + + # @test length(orfs01) == 5 +end + + # > 180195.SAMN03785337.LFLS01000089 -> finds only 1 gene in Prodigal (from Pyrodigal tests) + # seq02 = dna"AACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAACAGCACTGGCAATCTGACTGTGGGCGGTGTTACCAACGGCACTGCTACTACTGGCAACATCGCACTGACCGGTAACAATGCGCTGAGCGGTCCGGTCAATCTGAATGCGTCGAATGGCACGGTGACCTTGAACACGACCGGCAATACCACGCTCGGTAACGTGACGGCACAAGGCAATGTGACGACCAATGTGTCCAACGGCAGTCTGACGGTTACCGGCAATACGACAGGTGCCAACACCAACCTCAGTGCCAGCGGCAACCTGACCGTGGGTAACCAGGGCAATATCAGTACCGCAGGCAATGCAACCCTGACGGCCGGCGACAACCTGACGAGCACTGGCAATCTGACTGTGGGCGGCGTCACCAACGGCACGGCCACCACCGGCAACATCGCGCTGACCGGTAACAATGCACTGGCTGGTCCTGTCAATCTGAACGCGCCGAACGGCACCGTGACCCTGAACACAACCGGCAATACCACGCTGGGTAATGTCACCGCACAAGGCAATGTGACGACTAATGTGTCCAACGGCAGCCTGACAGTCGCTGGCAATACCACAGGTGCCAACACCAACCTGAGTGCCAGCGGCAATCTGACCGTGGGCAACCAGGGCAATATCAGTACCGCGGGCAATGCAACCCTGACTGCCGGCGGTAACCTGAGC" + # orfs02 = findorfs(seq02, NaiveFinder()) + + # @test length(orfs02) == 12 + # @test orfs02 == [ORF{NaiveFinder}(29:40, '+', 2, nothing), ORF{NaiveFinder}(137:145, '+', 2, nothing), ORF{NaiveFinder}(164:184, '+', 2, nothing), ORF{NaiveFinder}(173:184, '+', 2, nothing), ORF{NaiveFinder}(236:241, '+', 2, nothing), ORF{NaiveFinder}(248:268, '+', 2, nothing), ORF{NaiveFinder}(362:373, '+', 2, nothing), ORF{NaiveFinder}(470:496, '+', 2, nothing), ORF{NaiveFinder}(551:574, '+', 2, nothing), ORF{NaiveFinder}(569:574, '+', 2, nothing), ORF{NaiveFinder}(581:601, '+', 2, nothing), ORF{NaiveFinder}(695:706, '+', 2, nothing)] + + # From pyrodigal issue #13 link: https://github.com/althonos/pyrodigal/blob/1f939b0913b48dbaa55d574b20e124f1b8323825/pyrodigal/tests/test_orf_finder.py#L271 + # Pyrodigal predicts 2 genes from this sequence: + # 1) An alternative start codon (GTG) sequence at 48:347 + # 2) A common start codon sequence at 426:590 + # On the other hand, the NCBI ORFfinder program predicts 9 ORFs whose length is greater than 75 nt, from which one has an "outbound" stop + # seq03 = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" + # orfs03 = findorfs(seq03, NaiveFinder(), minlen=75) + # @test length(orfs03) == 9 + # @test orfs03 == [ORF{NaiveFinder}(37:156, '+', 1, nothing), ORF{NaiveFinder}(194:268, '-', 2, nothing), ORF{NaiveFinder}(194:283, '-', 2, nothing), ORF{NaiveFinder}(249:347, '+', 3, nothing), ORF{NaiveFinder}(426:590, '+', 3, nothing), ORF{NaiveFinder}(565:657, '+', 1, nothing), ORF{NaiveFinder}(650:727, '-', 2, nothing), ORF{NaiveFinder}(786:872, '+', 3, nothing), ORF{NaiveFinder}(887:976, '-', 2, nothing)] + #|-> This occured in Pyrodigal + # Lambda phage tests + # Compare to https://github.com/jonas-fuchs/viral_orf_finder/blob/master/orf_finder.py + # Salisbury and Tsorukas (2019) paper used the Lambda phage genome with 73 CDS and 545 non-CDS ORFs (a total of 618) to compare predictions between several Gene Finder programs + # For a minimal length of 75 nt the following ORFs are predicted: + # orf_finder.py --> 885 (222 complete) + # findorfs (GeneFinder.jl) --> 885 + # NCBI ORFfinder --> 375 ORFs + # orfipy --> 375 (`orfipy NC_001416.1.fasta --start ATG --include-stop --min 75`) + # NC_001416 = fasta2bioseq("data/NC_001416.1.fasta")[1] + # NC_001416_orfs = findorfs(NC_001416, NaiveFinder(), minlen=75) + # @test length(NC_001416_orfs) == 885 # end # @testitem "getorfs dna" default_imports=false begin diff --git a/test/runtests.jl b/test/runtests.jl index 2900ce2..3d34d6d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,9 +10,9 @@ using TestItemRunner @run_package_tests -# include("findorfstest.jl") +include("findorfstest.jl") +include("aquatest.jl") # include("iotest.jl") # include("getindextest.jl") -# include("aquatest.jl") end From 44c791cae43e03393a1dff35a348b3d850fff477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sat, 8 Jun 2024 17:44:58 -0500 Subject: [PATCH 24/40] Update naivecolletor.jl to new BMC losr --- src/algorithms/naivecollector.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index 024e705..0293958 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -32,7 +32,7 @@ function NaiveCollector( frame = framedict[(seqlen - x.captured[3]) % 3] end seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) - scr = scheme === nothing ? 0.0 : scheme(seq, ECOLICDS) + scr = scheme === nothing ? 0.0 : scheme(seq; kwargs...) # fts = Dict(:score => score) fts = Features((score = scr,)) # rbs = RBS(biore"RRR"dna, 3:4, 1.0) return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, seq, fts, scheme) From 7663ab6f886472a58850c560b85fda52f7fdccd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 9 Jun 2024 01:07:24 -0500 Subject: [PATCH 25/40] Update iscoding method, so that it takes an ORF directly --- src/iscoding.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/iscoding.jl b/src/iscoding.jl index 982efbc..ae74950 100644 --- a/src/iscoding.jl +++ b/src/iscoding.jl @@ -28,12 +28,11 @@ function iscoding( end function iscoding( - sequence::NucleicSeqOrView{DNAAlphabet{N}}, orf::ORF{N,F}; criteria::Function = lordr, kwargs... ) where {N,F<:GeneFinderMethod} - return criteria(sequence[orf]; kwargs...) + return criteria(sequence(orf); kwargs...) end From b01af90428055c1443102c805fc973a64d882e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 9 Jun 2024 01:07:56 -0500 Subject: [PATCH 26/40] Update score method to handle directly the score field --- src/types.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/types.jl b/src/types.jl index f9d1755..75078a5 100644 --- a/src/types.jl +++ b/src/types.jl @@ -84,7 +84,7 @@ function sequence(i::ORF{N,F}) where {N,F} end function score(i::ORF{N,F}) where {N,F} - return i.features[:score] + return i.features.fts[:score] end function length(i::ORF{N,F}) where {N,F} From 784c1d5b40427bf7bde0de00da6eccf5bcae8a0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 9 Jun 2024 01:08:42 -0500 Subject: [PATCH 27/40] Update finder methods kwargs to only use the scheme kwargs --- src/algorithms/naivecollector.jl | 2 +- src/algorithms/naivefinder.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index 0293958..5035616 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -46,5 +46,5 @@ function NaiveCollector( append!(orfs, strandedorfs) end - return sort!(orfs; kwargs...) + return sort!(orfs) end diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 833d615..54b8472 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -93,7 +93,7 @@ function NaiveFinder( end end end - return sort!(orfs; kwargs...) + return sort!(orfs) end # export createorf From 01c3e3e13a5c66f50b687e8b4150c66c3b926e80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 9 Jun 2024 01:09:39 -0500 Subject: [PATCH 28/40] After fixing the lors from BMC package the lordr is also updated, requires a new version of BMC. --- src/criteria/lordr.jl | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/criteria/lordr.jl b/src/criteria/lordr.jl index f2d446e..aed311c 100644 --- a/src/criteria/lordr.jl +++ b/src/criteria/lordr.jl @@ -41,19 +41,13 @@ iscoding(sequence) # Returns: true or false """ function lordr( #log_odds_ratio_decision, also lordr/cudr/kfdr/aadr sequence::NucleicSeqOrView{DNAAlphabet{N}}; - codingmodel::BioMarkovChain = ECOLICDS, - noncodingmodel::BioMarkovChain = ECOLINOCDS, - η::Float64 = 1e-5 + modela::BioMarkovChain = ECOLICDS, + modelb::BioMarkovChain = ECOLINOCDS, + b::Number = 2, + η::Float64 = 5e-3 ) where {N} - pcoding = dnaseqprobability(sequence, codingmodel) - pnoncoding = dnaseqprobability(sequence, noncodingmodel) - - logodds = log(pcoding / pnoncoding) - - length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") - - !hasprematurestop(sequence) || error("There is a premature stop codon in the sequence") + logodds = log_odds_ratio_score(sequence; modela=modela, modelb=modelb, b=b) if logodds > η return true else From 24b05b56487671f1afe32a44996cdc8602ae7d75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 9 Jun 2024 14:11:57 -0500 Subject: [PATCH 29/40] Update to BMC v0.10.1 --- Manifest.toml | 4 ++-- src/GeneFinder.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 2448361..f244a35 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -21,9 +21,9 @@ version = "0.1.4" [[deps.BioMarkovChains]] deps = ["BioSequences", "PrecompileTools", "VectorizedKmers"] -git-tree-sha1 = "72c9d5027120756a71fb7b1036a052b6f9353ed6" +git-tree-sha1 = "ff9f9e36892cf21222f2f2a0fe709a61b5d8dbfe" uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c" -version = "0.10.0" +version = "0.10.1" [deps.BioMarkovChains.extensions] DiscreteMarkovChainsExt = "DiscreteMarkovChains" diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 467c120..3999e94 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -23,7 +23,7 @@ using BioSequences: translate, gc_content -using BioMarkovChains: BioMarkovChain, dnaseqprobability, ECOLICDS, ECOLINOCDS, log_odds_ratio_score +using BioMarkovChains: BioMarkovChain, ECOLICDS, ECOLINOCDS, log_odds_ratio_score using FASTX: FASTAReader, sequence using IterTools: takewhile, iterated using GenomicFeatures: GenomicFeatures, AbstractGenomicInterval, GenomicInterval, Strand, summary, groupname, strand, metadata, STRAND_POS, STRAND_NEG, STRAND_BOTH, STRAND_NA, leftposition, rightposition, length From 204765625fba17161a238c8e978ee4da6f3051b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Sun, 9 Jun 2024 14:13:46 -0500 Subject: [PATCH 30/40] Update some docstring in lordr --- src/criteria/lordr.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/criteria/lordr.jl b/src/criteria/lordr.jl index aed311c..32cad02 100644 --- a/src/criteria/lordr.jl +++ b/src/criteria/lordr.jl @@ -3,8 +3,8 @@ export log_odds_ratio_decision_rule, lordr, lors @doc raw""" log_odds_ratio_decision_rule( sequence::LongSequence{DNAAlphabet{4}}; - codingmodel::BioMarkovChain, - noncodingmodel::BioMarkovChain, + modela::BioMarkovChain, + modelb::BioMarkovChain, η::Float64 = 1e-5 ) @@ -22,6 +22,7 @@ S(X) = \log \left( \frac{{P_C(X_1=i_1, \ldots, X_T=i_T)}}{{P_N(X_1=i_1, \ldots, ## Keyword Arguments - `codingmodel::BioMarkovChain`: The transition model for coding regions, (default: `ECOLICDS`). - `noncodingmodel::BioMarkovChain`: The transition model for non-coding regions, (default: `ECOLINOCDS`) +- `b::Number = 2`: The base of the logarithm used to calculate the log-odds ratio (default: 2). - `η::Float64 = 1e-5`: The threshold value (eta) for the log-odds ratio (default: 1e-5). # Returns From f21c4d991dc088509172a332f660fa0133535e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:12:47 -0500 Subject: [PATCH 31/40] Update deps --- Manifest.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index f244a35..88633f8 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -70,9 +70,9 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" [[deps.FASTX]] deps = ["Automa", "BioGenerics", "PrecompileTools", "StringViews", "TranscodingStreams"] -git-tree-sha1 = "24ce37a228990be0cb69b3a2dbcfb656f32fc679" +git-tree-sha1 = "b998f092c8fe26f0ca417a12df18da5340153f34" uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12" -version = "2.1.5" +version = "2.1.6" weakdeps = ["BioSequences"] [deps.FASTX.extensions] @@ -147,9 +147,9 @@ uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" version = "1.0.3" [[deps.TranscodingStreams]] -git-tree-sha1 = "a947ea21087caba0a798c5e494d0bb78e3a1a3a0" +git-tree-sha1 = "d73336d81cafdc277ff45558bb7eaa2b04a8e472" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.9" +version = "0.10.10" [deps.TranscodingStreams.extensions] TestExt = ["Test", "Random"] From 91cb0bb9ad3a426db712f86262815c0abff1d334 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:13:05 -0500 Subject: [PATCH 32/40] Update getindex of ORFs --- src/extended.jl | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/extended.jl b/src/extended.jl index e45cd44..b803858 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -4,12 +4,20 @@ import BioSequences: translate Base.isless(a::ORF{N,F}, b::ORF{N,F}) where {N,F} = isless(a.first:a.last, b.first:b.last) -function getindex(sequence::NucleicSeqOrView{A}, orf::ORF{N,F}) where {A,N,F} - if orf.strand == '+' || orf.strand == STRAND_POS - return @view sequence[orf.first:orf.last] +function getindex(seq::NucleicSeqOrView{A}, orf::ORF{N,F}) where {A,N,F} + @assert @view(seq[begin:end]) == source(orf) "The source sequence of the ORF and the given sequence are different" + + if orf.strand == STRAND_POS + s = @view seq[orf.first:orf.last] else - return reverse_complement(@view sequence[orf.first:orf.last]) + s = reverse_complement(@view seq[orf.first:orf.last]) + end + + if !occursin(biore"T(AG|AA|GA)"dna, s) + error("There is no stop codon at the end of the sequence/orf") end + + return s end function translate(orf::ORF{N,F}; kwargs...) where {N,F} From 5235e69df5f60a3e21e367c6d3bd2b1403d93902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:14:06 -0500 Subject: [PATCH 33/40] reduce ORF type fields: get rid of seq --- src/types.jl | 61 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/src/types.jl b/src/types.jl index 75078a5..f080d2c 100644 --- a/src/types.jl +++ b/src/types.jl @@ -2,8 +2,8 @@ import GenomicFeatures: first, last, length, strand, groupname, metadata import FASTX: sequence export Features, RBS, ORF -export features, sequence -export groupname, finder, frame, scheme, score +export features, sequence, source +export groupname, finder, frame, scheme, score, strand, STRAND_BOTH, STRAND_NEG, STRAND_POS, STRAND_NA #### Ribosome Binding Site (RBS) struct #### @@ -26,19 +26,33 @@ struct RBS end end - # Structs associated with gene models # const FEATUREDICT = Dict{Symbol,Any}(:gc => 0.0, :length => 0, :score => 0.0) # const Feature = Union{Real, Vector{RBS}} +# features::Dict{Symbol,Any} or # features::@NamedTuple{score::Float64, rbs::Any} +# seq::LongSubSeq{DNAAlphabet{N}} struct ORF{N,F} <: GenomicFeatures.AbstractGenomicInterval{F} groupname::String first::Int64 last::Int64 strand::Strand frame::Int - seq::LongSubSeq{DNAAlphabet{N}} - features::Features # features::Dict{Symbol,Any} or # features::@NamedTuple{score::Float64, rbs::Any} + features::Features scheme::Union{Nothing,Function} + + function ORF{N,F}( + groupname::String, + first::Int64, + last::Int64, + strand::Strand, + frame::Int, + features::Features, + scheme::Union{Nothing,Function} + ) where {N,F} + @assert frame in (1, 2, 3) "Invalid frame. Please provide a frame between 1 and 3." + + return new{N,F}(groupname, first, last, strand, frame, features, scheme) + end end function ORF{N,F}( @@ -46,17 +60,29 @@ function ORF{N,F}( groupname::String, first::Int64, last::Int64, - strand::Union{Strand,Char}, + strand::Strand, frame::Int, - seq::LongSubSeq{DNAAlphabet{N}}, features::Features, # ::Dict{Symbol,Any} or # ::@NamedTuple{score::Float64, rbs::Any} or @NamedTuple{Vararg{typeof(...)}} scheme::Union{Nothing,Function}=nothing ) where {N,F<:GeneFinderMethod} - # @assert frame in (1, 2, 3) && location[1] < location[end] && score isa Union{Nothing, Float64} && length(seq) == last(location) - first(location) + 1 && length(seq) % 3 == 0 "Invalid input. Please check the frame, location, score, and sequence length." - # groupname == "" && error("Please provide a groupname for the ORF.") - return ORF{N,F}(groupname, first, last, strand, frame, seq, features, scheme) #finder + return ORF{N,F}(groupname, first, last, strand, frame, features, scheme) #finder seq end + +# function ORF{N,F}( +# ::Type{F}, #finder +# groupname::Union{Nothing,String}, +# first::Int64, +# last::Int64, +# strand::Strand, +# frame::Int, +# features::Features, # ::Dict{Symbol,Any} or # ::@NamedTuple{score::Float64, rbs::Any} or @NamedTuple{Vararg{typeof(...)}} +# scheme::Union{Nothing,Function}=nothing +# ) where {N,F<:GeneFinderMethod} +# groupname = groupname === nothing ? "seq" : groupname +# return ORF{N,F}(groupname, first, last, strand, frame, features, scheme) #finder seq +# end + function groupname(i::ORF{N,F}) where {N,F} return i.groupname end @@ -80,7 +106,14 @@ function scheme(i::ORF{N,F}) where {N,F} end function sequence(i::ORF{N,F}) where {N,F} - return i.seq #i.strand == STRAND_POS ? i.seq : reverse_complement(i.seq) + seqsymb = Symbol(i.groupname) + seq = getfield(Main, seqsymb) + return i.strand == STRAND_POS ? @view(seq[i.first:i.last]) : reverse_complement(@view(seq[i.first:i.last])) #seq[i.first:i.last] +end + +function source(i::ORF{N,F}) where {N,F} + seqsymb = Symbol(i.groupname) + return getfield(Main, seqsymb) end function score(i::ORF{N,F}) where {N,F} @@ -88,13 +121,17 @@ function score(i::ORF{N,F}) where {N,F} end function length(i::ORF{N,F}) where {N,F} - return length(i.seq) + return length(sequence(i)) end function features(i::ORF{N,F}) where {N,F} return i.features.fts end +function strand(i::ORF{N,F}) where {N,F} + return i.strand +end + function Base.show(io::IO, i::ORF{N,F}) where {N,F} if get(io, :compact, false) print(io, "ORF{$(finder(i))}($(leftposition(i)):$(rightposition(i)), '$(strand(i))', $(frame(i)))") #{$(typeof(finder(i)))} $(score(i)) From 4607f96906279f0dab6a54d7f4a5f21fba66c8a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:14:49 -0500 Subject: [PATCH 34/40] Add new internal methods for calling var names and symbol --- src/algorithms/naivecollector.jl | 4 +- src/algorithms/naivefinder.jl | 99 ++++++++------------------------ src/utils.jl | 31 ++++++++-- test/Project.toml | 2 + test/aquatest.jl | 2 +- test/findorfstest.jl | 20 +++++-- test/runtests.jl | 3 + 7 files changed, 73 insertions(+), 88 deletions(-) diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index 5035616..be46d60 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -16,7 +16,7 @@ function NaiveCollector( framedict = Dict(0 => 3, 1 => 1, 2 => 2) revseq = reverse_complement(sequence) seqlen = length(sequence) - seqname = get_var_name(sequence) + seqname = _varname(sequence) function createorfs(x, strand) if length(x.captured[1]:x.captured[3]) < minlen @@ -35,7 +35,7 @@ function NaiveCollector( scr = scheme === nothing ? 0.0 : scheme(seq; kwargs...) # fts = Dict(:score => score) fts = Features((score = scr,)) # rbs = RBS(biore"RRR"dna, 3:4, 1.0) - return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, seq, fts, scheme) + return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, fts, scheme) # seq end orfs = Vector{ORF{N,NaiveCollector}}() diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 54b8472..f09f4af 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -32,11 +32,11 @@ function _locationiterator( end @doc raw""" - naivefinder(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) -> Vector{ORF} where {N} + NaiveFinder(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) -> Vector{ORF} where {N} A simple implementation that finds ORFs in a DNA sequence. -The `naivefinder` function takes a LongSequence{DNAAlphabet{4}} sequence and returns a Vector{ORF} containing the ORFs found in the sequence. +The `NaiveFinder` method takes a LongSequence{DNAAlphabet{4}} sequence and returns a Vector{ORF} containing the ORFs found in the sequence. It searches entire regularly expressed CDS, adding each ORF it finds to the vector. The function also searches the reverse complement of the sequence, so it finds ORFs on both strands. Extending the starting codons with the `alternative_start = true` will search for ATG, GTG, and TTG. Some studies have shown that in *E. coli* (K-12 strain), ATG, GTG and TTG are used 83 %, 14 % and 3 % respectively. @@ -72,86 +72,35 @@ function NaiveFinder( seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) orfs = Vector{ORF{N,NaiveFinder}}() - seqname = get_var_name(sequence) - for strand in ('+', '-') - s = strand == '-' ? reverse_complement(sequence) : sequence + + # Handle the sequence name + seqname = _varname(sequence) + if seqname === nothing + seqname = "unnamedseq" + end + + for strand in (STRAND_POS, STRAND_NEG) + s = strand == STRAND_NEG ? reverse_complement(sequence) : sequence @inbounds for location in @views _locationiterator(s; alternative_start) if length(location) >= minlen #main fields - start = strand == '+' ? location.start : seqlen - location.stop + 1 + start = strand == STRAND_POS ? location.start : seqlen - location.stop + 1 stop = start + length(location) - 1 - frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] - seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) - #features - scr = scheme === nothing ? 0.0 : scheme(seq; kwargs...) # @view(sequence[start:stop] - - #populate the feature dict - #fts = Dict(:score => score) #:gc => gc, :length => len, + frame = strand == STRAND_POS ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] + + if scheme === nothing + scr = 0.0 + else + seq = strand == STRAND_POS ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) + scr = scheme(seq; kwargs...) + end + + #populate the feature tuple fts = Features((score = scr,)) # rbs = RBS(biore"RRR"dna, 3:4, 1.0) - push!(orfs, ORF{N,NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme)) + push!(orfs, ORF{N,NaiveFinder}(seqname, start, stop, strand, frame, fts, scheme)) #seq end end end return sort!(orfs) -end - -# export createorf - -# function createorf( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}, -# seqname::String, -# location::UnitRange{Int64}, -# strand::Char, -# seqlen::Int64, -# minlen::Int64, -# scheme::Union{Nothing,Function} -# ) where {N} -# if length(location) < minlen -# return nothing -# end - -# framedict = Dict(0 => 3, 1 => 1, 2 => 2) - -# if strand == '+' -# start = location.start -# stop = start + length(location) - 1 -# frame = framedict[location.start % 3] -# else -# start = seqlen - location.stop + 1 -# stop = start + length(location) - 1 -# frame = framedict[(seqlen - location.stop + 1) % 3] -# end - -# seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) -# score = scheme === nothing ? 0 : scheme(@view(seq[start:stop]), ECOLICDS) -# gc = gc_content(seq) -# len = length(seq) -# fts = Dict(:gc => gc, :length => len, :score => score) - -# return ORF{N, NaiveFinder}(seqname, start, stop, strand, frame, seq, fts, scheme) -# end - -# function NaiveFinder( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# alternative_start::Bool = false, -# minlen::Int64 = 6, -# scheme::Union{Nothing,Function} = nothing, -# kwargs... -# ) where {N} -# seqlen = length(sequence) -# orfs = Vector{ORF{N,NaiveFinder}}() -# seqname = get_var_name(sequence) - -# @inbounds for strand in ('+', '-') -# seq = strand == '-' ? reverse_complement(sequence) : sequence -# @inbounds for location in @views _locationiterator(s; alternative_start) -# orf = createorf(seq, seqname, location, strand, seqlen, minlen, scheme) -# if orf !== nothing -# push!(orfs, orf) -# end -# end -# end - -# return sort!(orfs; kwargs...) -# end \ No newline at end of file +end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index f40a09f..f1528c4 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,4 @@ -export hasprematurestop, fasta2bioseq, get_var_name +export hasprematurestop, fasta2bioseq, _varname, _varsymbol # General purposes methods supporting main functions @@ -15,7 +15,9 @@ function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool wher length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") - occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence") + #TODO: this way of checking the stop codon at the end is idiosyncratic and should be improved since other stop codons are not considered + #TODO: This is currently used by the `getindex` method in `extended.jl` + occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence/orf") @inbounds for i in 1:3:length(sequence) - 4 codon = sequence[i:i+2] @@ -38,10 +40,29 @@ function fasta2bioseq(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4} end end -function get_var_name(var::NucleicSeqOrView{DNAAlphabet{N}}) where {N} +function _varname(var::Any) for name in names(Main) - if getfield(Main, name) === var - return string(name) + try + if getfield(Main, name) === var + return string(name) + end + catch e + # Skip if getfield fails, e.g., for names that cannot be accessed directly + continue + end + end + return nothing +end + +function _varsymbol(var::Any) + for name in names(Main) + try + if getfield(Main, name) === var + return Symbol(name) + end + catch e + # Skip if getfield fails, e.g., for names that cannot be accessed directly + continue end end return nothing diff --git a/test/Project.toml b/test/Project.toml index baa2a68..aa664ea 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,7 +1,9 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +BioMarkovChains = "f861b655-cb5f-42ce-b66a-341b542d4f2c" BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" +GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe" diff --git a/test/aquatest.jl b/test/aquatest.jl index 131c435..7cdf389 100644 --- a/test/aquatest.jl +++ b/test/aquatest.jl @@ -4,7 +4,7 @@ Aqua.test_persistent_tasks(GeneFinder) Aqua.test_piracies(GeneFinder) Aqua.test_stale_deps(GeneFinder) - Aqua.test_unbound_args(GeneFinder) + # Aqua.test_unbound_args(GeneFinder) Aqua.test_undefined_exports(GeneFinder) end \ No newline at end of file diff --git a/test/findorfstest.jl b/test/findorfstest.jl index c083ad7..6b95ad4 100644 --- a/test/findorfstest.jl +++ b/test/findorfstest.jl @@ -3,13 +3,23 @@ # cd(@__DIR__) # using GeneFinder: findorfs, NaiveFinder, ORF # A random seq to start - using BioSequences: @dna_str + using BioSequences: @dna_str + using GeneFinder: findorfs, NaiveFinder, ORF, source + # using GeneFinder: findorfs, NaiveFinder, ORF, Features, STRAND_POS, STRAND_NEG, source, sequence, groupname + seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" - # orfs01 = findorfs(seq01, finder=NaiveFinder) - orf01 = ORF{4,NaiveFinder}("seq01", 1, 33, '+', 1, seq01[1:33], Dict(:score => 0.0), nothing) - sequence(orf01) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" - groupname(orf01) == "seq01" + + orf01 = ORF{4,NaiveFinder}("seq01", 1, 33, STRAND_POS, 1, Features((score = 0.0,)), nothing) # ORF{4,NaiveFinder}("phi", 7, 15, '+', 1, Features((score = 0,)), nothing) + # source(orf01) + # GeneFinder.source(orf01) + orfs = findorfs(seq01) + @view(seq01[begin:end]) + seq01 + # seq01[orf01] + # sequence(orf01) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" + + # groupname(orf01) == "seq01" # @test orfs01 == [ # ORF{4,NaiveFinder}("seq01", 1, 33, '+', 1, seq01[1:33], Dict(:score => 0.0), nothing), diff --git a/test/runtests.jl b/test/runtests.jl index 3d34d6d..59966bf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,9 @@ module GeneFinderTests using Test using BioSequences using FASTX +using GenomicFeatures using GeneFinder +using BioMarkovChains using Aqua using TestItems using TestItemRunner @@ -16,3 +18,4 @@ include("aquatest.jl") # include("getindextest.jl") end + From 64192195eb3bbd760f217d3717e0785fff56fa9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:31:51 -0500 Subject: [PATCH 35/40] Update docs simple coding rule --- docs/src/simplecodingrule.md | 117 ++++++++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 23 deletions(-) diff --git a/docs/src/simplecodingrule.md b/docs/src/simplecodingrule.md index c6bbaf6..f8ec5ac 100644 --- a/docs/src/simplecodingrule.md +++ b/docs/src/simplecodingrule.md @@ -1,6 +1,6 @@ ## Scoring a sequence using a Markov model -A sequence of DNA could be scored using a Markov model of the transition probabilities of a known sequence. This could be done using a *log-odds ratio* score, which is the logarithm of the ratio of the transition probabilities of the sequence given a model and. The log-odds ratio score is defined as: +A sequence of DNA could be scored using a Markov model of the transition probabilities of a known sequence. This could be done using a *log-odds ratio score*, which is the logarithm of the ratio of the transition probabilities of the sequence given a model and. The log-odds ratio score is defined as: ```math \begin{align} @@ -17,20 +17,18 @@ using GeneFinder, BioSequences seq = dna"TTCGTCAGTCGTTCTGTTTCATTCAATACGATAGTAATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAGGTCAGAGGGGAACGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGATCATCCTCGGAACGGGCAATTTCGCTTGCCATATAATTCAGACAGGAAGCCAGATAACCGATTTCATCCTCACTATCGACCTGAAATTCATAATGCATATTACCGGCAGCATACTGCTCTGTGGCATGAGTGATCTTCCTCAGAGGAATATATACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAGCAGGAACAGGATTGCCAGGGTGATATAGGAAATATTCAGCAGGTTGTTACAGGATTTCTGAATATCATTCATATCAGTATGGATGACTACATAGCCTTTTACCTTGTAGTTGGAGGTAATGGGAGCAAATACAGTAAGTACATCCGAATCAAAATTACCGAAGAAATCACCAACAATGTAATAGGAGCCGCTGGTTACGGTCGAATCAAAATTCTCAATGACAACCACATTCTCCACATCTAAGGGACTATTGGTATCCAGTACCAGTCGTCCGGAGGGATTGATGATGCGAATCTCGGAATTCAGGTAGACCGCCAGGGAGTCCAGCTGCATTTTAACGGTCTCCAAAGTTGTTTCACTGGTGTACAATCCGCCGGCATAGGTTCCGGCGATCAGGGTTGCTTCGGAATAGAGACTTTCTGCCTTTTCCCGGATCAGATGTTCTTTGGTCATATTGGGAACAAAAGTTGTAACAATGATGAAACCAAATACACCAAAAATAAAATATGCGAGTATAAATTTTAGATAAAGTGTTTTTTTCATAACAAATCCTGCTTTTGGTATGACTTAATTACGTACTTCGAATTTATAGCCGATGCCCCAGATGGTGCTGATCTTCCAGTTGGCATGATCCTTGATCTTCTC" -orfs = findorfs(seq, minlen=75, NaiveFinderScored()) -``` - -```julia -9-element Vector{ORF}: - ORF(37:156, '+', 1, -0.0024384392084479912) - ORF(194:268, '-', 2, -0.026759927376272922) - ORF(194:283, '-', 2, -0.010354615336667268) - ORF(249:347, '+', 3, -0.02024959025464718) - ORF(426:590, '+', 3, -0.003289228147424537) - ORF(565:657, '+', 1, -0.014806468147370438) - ORF(650:727, '-', 2, -0.04303976584597201) - ORF(786:872, '+', 3, -0.03486633273294755) - ORF(887:976, '-', 2, -0.021989277630330317) +findorfs(seq, minlen=75, finder=NaiveFinder) + +9-element Vector{ORF{4, NaiveFinder}}: + ORF{NaiveFinder}(37:156, '+', 1,) + ORF{NaiveFinder}(194:268, '-', 2) + ORF{NaiveFinder}(194:283, '-', 2) + ORF{NaiveFinder}(249:347, '+', 3) + ORF{NaiveFinder}(426:590, '+', 3) + ORF{NaiveFinder}(565:657, '+', 1) + ORF{NaiveFinder}(650:727, '-', 2) + ORF{NaiveFinder}(786:872, '+', 3) + ORF{NaiveFinder}(887:976, '-', 2) ``` ## The *log-odds ratio* decision rule @@ -48,9 +46,9 @@ Where the ``P_{C}`` is the probability of the sequence given a CDS model, ``P_{N In this package we have implemented this rule and call some basic models of CDS and No-CDS of *E. coli* from Axelson-Fisk (2015) work (implemented in `BioMarkovChains.jl` package). To check whether a random sequence could be coding based on these decision we use the predicate `log_odds_ratio_decision_rule` with the `ECOLICDS` and `ECOLINOCDS` models: ```julia -orfsdna = getorfs(seq,DNAAlphabet{4}(), NaiveFinder(), minlen=75, alternative_start=true) +orfsdna = findorfs(seq, minlen=75, alternative_start=true) .|> sequence -20-element Vector{LongSubSeq{DNAAlphabet{4}}}: +20-element Vector{NucSeq{4, DNAAlphabet{4}}} ATGTATTTTTCGTGCATTTCCGGTGGAATCGTGCCGTCC…CGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGA GTGCATTTCCGGTGGAATCGTGCCGTCCAGCATAGCCTC…TACGATCTCAGTGAAAAAGATCAGAATGATCAGGGATAG GTGCCGTCCAGCATAGCCTCCAGATATCCCCTTATAGAG…CGGAAATCGTGGGATACATTGGCTACAAACTTTTTCTGA @@ -74,6 +72,34 @@ orfsdna = getorfs(seq,DNAAlphabet{4}(), NaiveFinder(), minlen=75, alternative_s ``` +Now, we can score the sequences using the *log-odds ratio* score in the same line: + +```julia +orfsfeat = findorfs(seq, minlen=75, alternative_start=true, scheme=lors) .|> features + +20-element Vector{@NamedTuple{score::Float64}}: + (score = -2.5146325834372343,) + (score = -4.857592765476053,) + (score = -1.9986133020444345,) + (score = -3.4106894574555824,) + (score = -1.763485388728319,) + (score = 0.6825864481251348,) + (score = 0.21287161698917936,) + (score = -0.28187825646085224,) + (score = -1.373474082107631,) + (score = -4.273794970087796,) + (score = -2.3961559066784597,) + (score = -2.3663038090046142,) + (score = -0.8406863072332524,) + (score = 1.8013554455006733,) + (score = -2.0768031699080756,) + (score = -1.734088708668584,) + (score = -2.9820908143871194,) + (score = -3.072550585883162,) + (score = -2.712493281013948,) + (score = -2.0453354284951786,) +``` + Now the question is which of those sequences can we consider as coding sequences. We can use the `iscoding` predicate to check whether a sequence is coding or not based on the *log-odds ratio* decision rule: ```julia @@ -109,7 +135,7 @@ In this case, the sequence has 20 ORFs and only 3 of them are classified as codi ```julia orfs = filter(orf -> iscoding(orf), orfsdna) -3-element Vector{LongSubSeq{DNAAlphabet{4}}}: +3-element Vector{NucSeq{4, DNAAlphabet{4}}} ATGCTGCCGGTAATATGCATTATGAATTTCAGGTCGATAGTGAGGATGAAATCGGTTATCTGGCTTCCTGTCTGA ATGCCACAGAGCAGTATGCTGCCGGTAATATGCATTATG…ATAGTGAGGATGAAATCGGTTATCTGGCTTCCTGTCTGA ATGCCGGCGGATTGTACACCAGTGAAACAACTTTGGAGACCGTTAAAATGCAGCTGGACTCCCTGGCGGTCTACCTGA @@ -118,11 +144,56 @@ orfs = filter(orf -> iscoding(orf), orfsdna) Or in terms of the `ORF` object: ```julia -orfs = findorfs(seq, minlen=75, NaiveFinderScored(), alternative_start=true) # find ORFs with alternative start as well +orfs = findorfs(seq, minlen=75, finder=NaiveFinder, alternative_start=true) # find ORFs with alternative start as well orfs[iscoding.(orfsdna)] - 3-element Vector{ORF}: - ORF(194:268, '-', 2, -0.026759927376272922) - ORF(194:283, '-', 2, -0.010354615336667268) - ORF(650:727, '-', 2, -0.04303976584597201) +3-element Vector{ORF{4, NaiveFinder}}: + ORF{NaiveFinder}(194:268, '-', 2, -0.026759927376272922) + ORF{NaiveFinder}(194:283, '-', 2, -0.010354615336667268) + ORF{NaiveFinder}(650:727, '-', 2, -0.04303976584597201) +``` + + +Or in a single line using another genome sequence: + +```julia + +phi = dna"GTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGCTTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGTTCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTATATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTGTCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGCCTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTGAATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGCCGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGTTTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTGCTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAAAGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCTGGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTGGTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGATAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTATCTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGGTTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGAGATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGACCAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTATGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCAAACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTTCTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGATACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCGTCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTTCTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTATTGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGCATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATGTTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGAATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCCCTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATTGCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTGCTATTCAGCGTTTGATGAATGCAATGCGACAGGCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTTATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCGTCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCGGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGCCCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCATCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTACTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGCATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCATGATGTTATTTCTTCATTTGGAGGTAAAACCTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTTGATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGCCGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGAAGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTTATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGA" + + +filter(x -> iscoding(sequence(x), η=1e-10) && length(x) > 100, findorfs(phi)) + +34-element Vector{ORF{4, NaiveFinder}}: + ORF{NaiveFinder}(636:1622, '+', 3) + ORF{NaiveFinder}(687:1622, '+', 3) + ORF{NaiveFinder}(774:1622, '+', 3) + ORF{NaiveFinder}(781:1389, '+', 1) + ORF{NaiveFinder}(814:1389, '+', 1) + ORF{NaiveFinder}(829:1389, '+', 1) + ORF{NaiveFinder}(861:1622, '+', 3) + ORF{NaiveFinder}(1021:1389, '+', 1) + ORF{NaiveFinder}(1386:1622, '+', 3) + ORF{NaiveFinder}(1447:1635, '+', 1) + ORF{NaiveFinder}(1489:1635, '+', 1) + ORF{NaiveFinder}(1501:1635, '+', 1) + ORF{NaiveFinder}(1531:1635, '+', 1) + ORF{NaiveFinder}(2697:3227, '+', 3) + ORF{NaiveFinder}(2745:3227, '+', 3) + ⋮ + ORF{NaiveFinder}(2874:3227, '+', 3) + ORF{NaiveFinder}(2973:3227, '+', 3) + ORF{NaiveFinder}(3108:3227, '+', 3) + ORF{NaiveFinder}(3142:3312, '+', 1) + ORF{NaiveFinder}(3481:3939, '+', 1) + ORF{NaiveFinder}(3659:3934, '+', 2) + ORF{NaiveFinder}(3734:3934, '+', 2) + ORF{NaiveFinder}(3772:3939, '+', 1) + ORF{NaiveFinder}(3806:3934, '+', 2) + ORF{NaiveFinder}(4129:4287, '+', 1) + ORF{NaiveFinder}(4160:4291, '-', 2) + ORF{NaiveFinder}(4540:4644, '+', 1) + ORF{NaiveFinder}(4690:4866, '+', 1) + ORF{NaiveFinder}(4741:4866, '+', 1) + ORF{NaiveFinder}(4744:4866, '+', 1) + ``` \ No newline at end of file From 91e245070573e9754901637dd92d8642dafa71d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:38:29 -0500 Subject: [PATCH 36/40] Up deps and extras --- Manifest.toml | 2 +- Project.toml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 88633f8..a4ebd14 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.4" manifest_format = "2.0" -project_hash = "e13b3d62f031e8e91ec33e86f0dbf42fa3097ba0" +project_hash = "217dc6fb9e029f25cd73fc7c19c4343d692809b3" [[deps.Automa]] deps = ["PrecompileTools", "TranscodingStreams"] diff --git a/Project.toml b/Project.toml index 2fe732b..be9da5f 100644 --- a/Project.toml +++ b/Project.toml @@ -18,11 +18,13 @@ FASTX = "2" GenomicFeatures = "3" IterTools = "1.4" PrecompileTools = "1" -julia = "1" +julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +StyledStrings = "f489334b-da3d-4c2e-b8f0-e476e12c162b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [targets] test = ["Test", "Aqua"] From f7ae859147e137116a6ebd4ee565c6246d26fae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:40:24 -0500 Subject: [PATCH 37/40] Add YAML extra --- Project.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index be9da5f..16ee731 100644 --- a/Project.toml +++ b/Project.toml @@ -22,9 +22,10 @@ julia = "1.6" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" StyledStrings = "f489334b-da3d-4c2e-b8f0-e476e12c162b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [targets] -test = ["Test", "Aqua"] +test = ["Test", "Aqua", "LinearAlgebra", "StyledStrings", "YAML"] From 0ffa402e7278ac5a2a3aed02eb3b723a1ab4ade3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:43:33 -0500 Subject: [PATCH 38/40] Update gitignore and delete Manifest --- .gitignore | 3 +- Manifest.toml | 181 -------------------------------------------------- 2 files changed, 2 insertions(+), 182 deletions(-) delete mode 100644 Manifest.toml diff --git a/.gitignore b/.gitignore index 2c2e953..5ccfac7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ .travis.yml /docs/src/*_cache /docs/src/*_files -/test/Manifest.toml \ No newline at end of file +/test/Manifest.toml +Manifest.toml \ No newline at end of file diff --git a/Manifest.toml b/Manifest.toml deleted file mode 100644 index a4ebd14..0000000 --- a/Manifest.toml +++ /dev/null @@ -1,181 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.10.4" -manifest_format = "2.0" -project_hash = "217dc6fb9e029f25cd73fc7c19c4343d692809b3" - -[[deps.Automa]] -deps = ["PrecompileTools", "TranscodingStreams"] -git-tree-sha1 = "588e0d680ad1d7201d4c6a804dcb1cd9cba79fbb" -uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" -version = "1.0.3" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.BioGenerics]] -deps = ["TranscodingStreams"] -git-tree-sha1 = "7bbc085aebc6faa615740b63756e4986c9e85a70" -uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea" -version = "0.1.4" - -[[deps.BioMarkovChains]] -deps = ["BioSequences", "PrecompileTools", "VectorizedKmers"] -git-tree-sha1 = "ff9f9e36892cf21222f2f2a0fe709a61b5d8dbfe" -uuid = "f861b655-cb5f-42ce-b66a-341b542d4f2c" -version = "0.10.1" - - [deps.BioMarkovChains.extensions] - DiscreteMarkovChainsExt = "DiscreteMarkovChains" - MarkovChainHammerExt = "MarkovChainHammer" - - [deps.BioMarkovChains.weakdeps] - DiscreteMarkovChains = "8abcb7ef-b365-4f7b-ac38-56893fb62f9f" - MarkovChainHammer = "38c40fd0-bccb-4723-b30d-b2caea0ad8d9" - -[[deps.BioSequences]] -deps = ["BioSymbols", "PrecompileTools", "Random", "Twiddle"] -git-tree-sha1 = "6fdba8b4279460fef5674e9aa2dac7ef5be361d5" -uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" -version = "3.1.6" - -[[deps.BioSymbols]] -deps = ["PrecompileTools"] -git-tree-sha1 = "e32a61f028b823a172c75e26865637249bb30dff" -uuid = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" -version = "5.1.3" - -[[deps.Compat]] -deps = ["TOML", "UUIDs"] -git-tree-sha1 = "b1c55339b7c6c350ee89f2c1604299660525b248" -uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.15.0" - - [deps.Compat.extensions] - CompatLinearAlgebraExt = "LinearAlgebra" - - [deps.Compat.weakdeps] - Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" - LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.20" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.FASTX]] -deps = ["Automa", "BioGenerics", "PrecompileTools", "StringViews", "TranscodingStreams"] -git-tree-sha1 = "b998f092c8fe26f0ca417a12df18da5340153f34" -uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12" -version = "2.1.6" -weakdeps = ["BioSequences"] - - [deps.FASTX.extensions] - BioSequencesExt = "BioSequences" - -[[deps.GenomicFeatures]] -deps = ["BioGenerics", "DataStructures", "IntervalTrees"] -git-tree-sha1 = "720844f71f118dd2ab4898a89b2b4feca7730d81" -uuid = "899a7d2d-5c61-547b-bef9-6698a8d05446" -version = "3.0.0" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.IntervalTrees]] -git-tree-sha1 = "dc3b97bb5c9cb7c437f74027309f2c2f09a82aaf" -uuid = "524e6230-43b7-53ae-be76-1e9e4d08d11b" -version = "1.1.0" - -[[deps.IterTools]] -git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023" -uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -version = "1.10.0" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.OrderedCollections]] -git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.6.3" - -[[deps.PrecompileTools]] -deps = ["Preferences"] -git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" -uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.2.1" - -[[deps.Preferences]] -deps = ["TOML"] -git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.3" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.Random]] -deps = ["SHA"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.StaticArraysCore]] -git-tree-sha1 = "192954ef1208c7019899fbf8049e717f92959682" -uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" -version = "1.4.3" - -[[deps.StringViews]] -git-tree-sha1 = "f7b06677eae2571c888fd686ba88047d8738b0e3" -uuid = "354b36f9-a18e-4713-926e-db85100087ba" -version = "1.3.3" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.TranscodingStreams]] -git-tree-sha1 = "d73336d81cafdc277ff45558bb7eaa2b04a8e472" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.10" - - [deps.TranscodingStreams.extensions] - TestExt = ["Test", "Random"] - - [deps.TranscodingStreams.weakdeps] - Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.Twiddle]] -git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a" -uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795" -version = "1.1.2" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.VectorizedKmers]] -deps = ["StaticArraysCore"] -git-tree-sha1 = "c2a7ec780efdf917088b5b65f78f402f72a2eb01" -uuid = "2ef45bd6-4c2b-43d8-88b3-40597287359a" -version = "0.9.2" -weakdeps = ["BioSequences"] - - [deps.VectorizedKmers.extensions] - BioSequencesExt = "BioSequences" From 929f482cb8055ac645d2a5b513533c9c1842fdcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:47:16 -0500 Subject: [PATCH 39/40] Update actions cache and cov --- .github/workflows/CI.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 9f4837b..98e9876 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -35,11 +35,11 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v4 with: files: lcov.info docs: From 67e046d9871e4e3553c06d444d304a2dcf729fa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Mon, 1 Jul 2024 14:51:22 -0500 Subject: [PATCH 40/40] refactor: Simplify NaiveCollector createorfs function --- src/algorithms/naivecollector.jl | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/algorithms/naivecollector.jl b/src/algorithms/naivecollector.jl index be46d60..678fe2f 100644 --- a/src/algorithms/naivecollector.jl +++ b/src/algorithms/naivecollector.jl @@ -19,10 +19,12 @@ function NaiveCollector( seqname = _varname(sequence) function createorfs(x, strand) + if length(x.captured[1]:x.captured[3]) < minlen return nothing end - if strand == '+' + + if strand == STRAND_POS start = x.captured[1] stop = x.captured[3] + 1 frame = framedict[x.captured[1] % 3] @@ -31,16 +33,25 @@ function NaiveCollector( stop = seqlen - x.captured[1] + 1 frame = framedict[(seqlen - x.captured[3]) % 3] end - seq = strand == '+' ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) - scr = scheme === nothing ? 0.0 : scheme(seq; kwargs...) + + if scheme === nothing + scr = 0.0 + else + seq = strand == STRAND_POS ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) + scr = scheme(seq; kwargs...) + end + + # seq = strand == STRAND_POS ? @view(sequence[start:stop]) : reverse_complement(@view(sequence[start:stop])) #@view(sequence[start:stop]) + # scr = scheme === nothing ? 0.0 : scheme(seq; kwargs...) # fts = Dict(:score => score) fts = Features((score = scr,)) # rbs = RBS(biore"RRR"dna, 3:4, 1.0) + return ORF{N,NaiveCollector}(seqname, start, stop, strand, frame, fts, scheme) # seq end orfs = Vector{ORF{N,NaiveCollector}}() - for strand in ('+', '-') - seq = strand == '-' ? revseq : sequence + for strand in (STRAND_POS, STRAND_NEG) + seq = strand == STRAND_NEG ? revseq : sequence matches = eachmatch(regorf, seq, overlap) strandedorfs = filter(!isnothing, [createorfs(x, strand) for x in matches]) append!(orfs, strandedorfs)