From bc999a070cb55e4f18712282413c84894b684ce2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Wed, 14 Feb 2024 10:56:18 -0500 Subject: [PATCH 01/17] Comment some ideas for new implementations --- src/algorithms/findorfs.jl | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/algorithms/findorfs.jl b/src/algorithms/findorfs.jl index 7d059fd..f836e77 100644 --- a/src/algorithms/findorfs.jl +++ b/src/algorithms/findorfs.jl @@ -193,4 +193,20 @@ function record_orfs_faa( push!(records, record) end return records -end \ No newline at end of file +end + +## ideas to genefinder (this is the waay window function are implemented in the DSP.jl package) + +# function make_orf_finder(finderfunc::Function, sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} +# ... +# end + +# function locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} +# make_orf_finder(sequence::NucleicSeqOrView{DNAAlphabet{N}}) do x +# ... +# end +# end + +# function findorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}; finder::Function=locationiterator) where {N} +# ... +# end From bf77810ab03d702167e5aaf98f2dcb1c37788a70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 13:01:52 -0500 Subject: [PATCH 02/17] Move get_orfs* methods to its own file --- src/getorfs.jl | 131 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 src/getorfs.jl diff --git a/src/getorfs.jl b/src/getorfs.jl new file mode 100644 index 0000000..46887c2 --- /dev/null +++ b/src/getorfs.jl @@ -0,0 +1,131 @@ +export get_orfs_dna, get_orfs_aa, record_orfs_fna, record_orfs_faa + +#### get_orfs_* methods #### + +""" + get_orfs_dna(sequence::NucleicSeqOrView{DNAAlphabet{N}}, output::String; kwargs...) where {N} + +This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function. The function then extracts the DNA sequence of each ORF and stores it in a `Vector{LongSubSeq{DNAAlphabet{4}}}`. + +# Arguments + +- `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` + +# Keyword Arguments + +- `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. +- `min_len::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). + +""" +function get_orfs_dna( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false, + min_len::Int64 = 6 +) where {N} + orfs = findorfs(sequence; alternative_start, min_len) + seqs = Vector{LongSubSeq{DNAAlphabet{N}}}(undef, length(orfs)) #Vector{}(undef, length(orfs)) # todo correct the output type + @inbounds for (i, orf) in enumerate(orfs) + seqs[i] = sequence[orf] + end + return seqs +end + +""" + get_orfs_aa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} + +This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function. The function then translates each ORF into an amino acid sequence and stores it in a `Vector{LongSubSeq{AminoAcidAlphabet}}`. + +# Arguments + +- `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` + +# Keyword Arguments + +- `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. +- `min_len::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). + +""" +function get_orfs_aa( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false, + code::GeneticCode = ncbi_trans_table[1], + min_len::Int64 = 6 +) where {N} + orfs = findorfs(sequence; alternative_start, min_len) + aas = Vector{LongSubSeq{AminoAcidAlphabet}}(undef, length(orfs)) + @inbounds for (i, orf) in enumerate(orfs) + aas[i] = translate(sequence[orf]; code) + end + return aas +end + +#### record_orfs_* methods #### + +""" + record_orfs_fna(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} + +Record Open Reading Frames (ORFs) in a nucleic acid sequence. + +# Arguments +- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The nucleic acid sequence to search for ORFs. +- `alternative_start::Bool=false`: If `true`, consider alternative start codons for ORFs. +- `min_len::Int=6`: The minimum length of an ORF to be recorded. + +# Returns +An array of `FASTARecord` objects representing the identified ORFs. + +# Description +This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument. +""" +function record_orfs_fna( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false, + min_len::Int64 = 6 +) where {N} + orfs = findorfs(sequence; alternative_start, min_len) + norfs = length(orfs) + padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) + records = FASTARecord[] + @inbounds for (index, orf) in enumerate(orfs) + id = string(lpad(string(index), padding, "0")) + header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" + record = FASTARecord(header, sequence[orf]) + push!(records, record) + end + return records +end + +""" + record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} + +This function takes a nucleic acid sequence and finds all open reading frames (ORFs) in the sequence. +An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon. +The function returns a list of FASTA records, where each record represents an ORF in the sequence. + +# Arguments +- `sequence`: The nucleic acid sequence to search for ORFs. +- `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. +- `code`: The genetic code to use for translation. Default is the NCBI translation table 1. +- `min_len`: The minimum length of an ORF. ORFs shorter than this length will be ignored. Default is 6. + +# Returns +- A list of FASTA records representing the ORFs found in the sequence. +""" +function record_orfs_faa( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false, + code::GeneticCode = ncbi_trans_table[1], + min_len::Int64 = 6 +) where {N} + orfs = findorfs(sequence; alternative_start, min_len) + norfs = length(orfs) + padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) + records = FASTARecord[] + @inbounds for (index, orf) in enumerate(orfs) + id = string(lpad(string(index), padding, "0")) + header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" + record = FASTARecord(header, translate(sequence[orf]; code)) + push!(records, record) + end + return records +end \ No newline at end of file From 6187202d9e70b8b74fc6a753abdf8b66a2332e31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 13:02:01 -0500 Subject: [PATCH 03/17] Update Julia version and package dependencies --- Manifest.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 8a6348d..371ac23 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,6 +1,6 @@ # This file is machine-generated - editing it directly is not advised -julia_version = "1.10.0" +julia_version = "1.10.2" manifest_format = "2.0" project_hash = "4f6774eaeaec568c8cb7a865a84c0c7d9a180727" @@ -49,15 +49,15 @@ version = "1.10.0" [[deps.PrecompileTools]] deps = ["Preferences"] -git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f" +git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.2.0" +version = "1.2.1" [[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" +git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.1" +version = "1.4.3" [[deps.Printf]] deps = ["Unicode"] @@ -82,9 +82,9 @@ uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" version = "1.0.3" [[deps.TranscodingStreams]] -git-tree-sha1 = "54194d92959d8ebaa8e26227dbe3cdefcdcd594f" +git-tree-sha1 = "71509f04d045ec714c4748c785a59045c3736349" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.3" +version = "0.10.7" [deps.TranscodingStreams.extensions] TestExt = ["Test", "Random"] From de55589c28e353ce360e98ce1b94a9d4384bf88d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 13:03:19 -0500 Subject: [PATCH 04/17] Main step of decoupling. The `findorfs` method now gains a `::Function` arg to call defined algorithms for ORF finding. --- src/algorithms/findorfs.jl | 212 ---------------------------------- src/algorithms/naivefinder.jl | 120 +++++++++++++++++++ src/findorfs.jl | 26 +++++ 3 files changed, 146 insertions(+), 212 deletions(-) delete mode 100644 src/algorithms/findorfs.jl create mode 100644 src/algorithms/naivefinder.jl create mode 100644 src/findorfs.jl diff --git a/src/algorithms/findorfs.jl b/src/algorithms/findorfs.jl deleted file mode 100644 index 9de8701..0000000 --- a/src/algorithms/findorfs.jl +++ /dev/null @@ -1,212 +0,0 @@ -""" - locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool=false) where {N} - -This is an iterator function that uses regular expressions to search the entire ORF (instead of start and stop codons) in a `LongSequence{DNAAlphabet{4}}` sequence. - It uses an anonymous function that will find the first regularly expressed ORF. Then using this anonymous function it creates an iterator that will apply it until there is no other CDS. - -!!! note - As a note of the implementation we want to expand on how the ORFs are found: - - The expression `(?:[N]{3})*?` serves as the boundary between the start and stop codons. Within this expression, the character class `[N]{3}` captures exactly three occurrences of any character (representing nucleotides using IUPAC codes). This portion functions as the regular codon matches. Since it is enclosed within a non-capturing group `(?:)` and followed by `*?`, it allows for the matching of intermediate codons, but with a preference for the smallest number of repetitions. - - In summary, the regular expression `ATG(?:[N]{3})*?T(AG|AA|GA)` identifies patterns that start with "ATG," followed by any number of three-character codons (represented by "N" in the IUPAC code), and ends with a stop codon "TAG," "TAA," or "TGA." This pattern is commonly used to identify potential protein-coding regions within genetic sequences. - - See more about the discussion [here](https://discourse.julialang.org/t/how-to-improve-a-generator-to-be-more-memory-efficient-when-it-is-collected/92932/8?u=camilogarciabotero) - -""" -function locationiterator( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false -) where {N} - regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG(?:[N]{3})*?T(AG|AA|GA)"dna - # regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG([N]{3})*T(AG|AA|GA)?"dna # an attempt to make it non PCRE non-determinsitic - finder(x) = findfirst(regorf, sequence, first(x) + 1) # + 3 - itr = takewhile(!isnothing, iterated(finder, findfirst(regorf, sequence))) - return itr -end - -""" - findorfs(sequence::NucleicAlphabet{DNAAlphabet{N}}; alternative_start::Bool=false, min_len::Int64=6)::Vector{ORF} where {N} - -A simple implementation that finds ORFs in a DNA sequence. - -The `findorfs` function takes a LongSequence{DNAAlphabet{4}} sequence and returns a Vector{ORF} containing the ORFs found in the sequence. - It searches entire regularly expressed CDS, adding each ORF it finds to the vector. The function also searches the reverse complement of the sequence, so it finds ORFs on both strands. - Extending the starting codons with the `alternative_start = true` will search for ATG, GTG, and TTG. - Some studies have shown that in *E. coli* (K-12 strain), ATG, GTG and TTG are used 83 %, 14 % and 3 % respectively. -!!! note - This function has not ORFs scoring scheme. Thus it might consider `aa"M*"` a posible encoding protein from the resulting ORFs. - -# Keywords - -- `alternative_start::Bool=false`: If true will pass the extended start codons to search. This will increase 3x the exec. time. -- `min_len::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. -""" -function findorfs( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N} - orfs = Vector{ORF}(undef, 0) - reversedseq = reverse_complement(sequence) - seqlen = length(sequence) - framedict = Dict(0 => 3, 1 => 1, 2 => 2) - - for strand in ('+', '-') - seq = strand == '-' ? reversedseq : sequence - - @inbounds for location in @views locationiterator(seq; alternative_start) - if length(location) >= min_len - frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] - push!(orfs, ORF(strand == '+' ? location : (seqlen - location.stop + 1):(seqlen - location.start + 1), strand, frame, 0.0)) - end - end - end - return sort(orfs) -end - -#### get_orfs_* methods #### - -""" - get_orfs_dna(sequence::NucleicSeqOrView{DNAAlphabet{N}}, output::String; kwargs...) where {N} - -This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function. The function then extracts the DNA sequence of each ORF and stores it in a `Vector{LongSubSeq{DNAAlphabet{4}}}`. - -# Arguments - -- `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` - -# Keyword Arguments - -- `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. -- `min_len::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). - -""" -function get_orfs_dna( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - seqs = Vector{LongSubSeq{DNAAlphabet{N}}}(undef, length(orfs)) #Vector{}(undef, length(orfs)) # todo correct the output type - @inbounds for (i, orf) in enumerate(orfs) - seqs[i] = sequence[orf] - end - return seqs -end - -""" - get_orfs_aa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies the open reading frames (ORFs) using the `findorfs()` function. The function then translates each ORF into an amino acid sequence and stores it in a `Vector{LongSubSeq{AminoAcidAlphabet}}`. - -# Arguments - -- `sequence`: The input sequence as a `NucleicSeqOrView{DNAAlphabet{N}}` - -# Keyword Arguments - -- `alternative_start::Bool=false`: If set to `true`, the function considers alternative start codons when searching for ORFs. This increases the execution time by approximately 3x. -- `min_len::Int64=6`: The minimum length of the allowed ORF. By default, it allows ORFs that can encode at least one amino acid (e.g., `aa"M*"`). - -""" -function get_orfs_aa( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - aas = Vector{LongSubSeq{AminoAcidAlphabet}}(undef, length(orfs)) - @inbounds for (i, orf) in enumerate(orfs) - aas[i] = translate(sequence[orf]; code) - end - return aas -end - -#### record_orfs_* methods #### - -""" - record_orfs_fna(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -Record Open Reading Frames (ORFs) in a nucleic acid sequence. - -# Arguments -- `sequence::NucleicSeqOrView{DNAAlphabet{N}}`: The nucleic acid sequence to search for ORFs. -- `alternative_start::Bool=false`: If `true`, consider alternative start codons for ORFs. -- `min_len::Int=6`: The minimum length of an ORF to be recorded. - -# Returns -An array of `FASTARecord` objects representing the identified ORFs. - -# Description -This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument. -""" -function record_orfs_fna( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - records = FASTARecord[] - @inbounds for (index, orf) in enumerate(orfs) - id = string(lpad(string(index), padding, "0")) - header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" - record = FASTARecord(header, sequence[orf]) - push!(records, record) - end - return records -end - -""" - record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} - -This function takes a nucleic acid sequence and finds all open reading frames (ORFs) in the sequence. -An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon. -The function returns a list of FASTA records, where each record represents an ORF in the sequence. - -# Arguments -- `sequence`: The nucleic acid sequence to search for ORFs. -- `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. -- `code`: The genetic code to use for translation. Default is the NCBI translation table 1. -- `min_len`: The minimum length of an ORF. ORFs shorter than this length will be ignored. Default is 6. - -# Returns -- A list of FASTA records representing the ORFs found in the sequence. -""" -function record_orfs_faa( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - records = FASTARecord[] - @inbounds for (index, orf) in enumerate(orfs) - id = string(lpad(string(index), padding, "0")) - header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" - record = FASTARecord(header, translate(sequence[orf]; code)) - push!(records, record) - end - return records -end - -## ideas to genefinder (this is the waay window function are implemented in the DSP.jl package) - -# function make_orf_finder(finderfunc::Function, sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} -# ... -# end - -# function locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} -# make_orf_finder(sequence::NucleicSeqOrView{DNAAlphabet{N}}) do x -# ... -# end -# end - -# function findorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}; finder::Function=locationiterator) where {N} -# ... -# end diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl new file mode 100644 index 0000000..0e94d9f --- /dev/null +++ b/src/algorithms/naivefinder.jl @@ -0,0 +1,120 @@ +export naivefinder + +""" + locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}; alternative_start::Bool=false) where {N} + +This is an iterator function that uses regular expressions to search the entire ORF (instead of start and stop codons) in a `LongSequence{DNAAlphabet{4}}` sequence. + It uses an anonymous function that will find the first regularly expressed ORF. Then using this anonymous function it creates an iterator that will apply it until there is no other CDS. + +!!! note + As a note of the implementation we want to expand on how the ORFs are found: + + The expression `(?:[N]{3})*?` serves as the boundary between the start and stop codons. Within this expression, the character class `[N]{3}` captures exactly three occurrences of any character (representing nucleotides using IUPAC codes). This portion functions as the regular codon matches. Since it is enclosed within a non-capturing group `(?:)` and followed by `*?`, it allows for the matching of intermediate codons, but with a preference for the smallest number of repetitions. + + In summary, the regular expression `ATG(?:[N]{3})*?T(AG|AA|GA)` identifies patterns that start with "ATG," followed by any number of three-character codons (represented by "N" in the IUPAC code), and ends with a stop codon "TAG," "TAA," or "TGA." This pattern is commonly used to identify potential protein-coding regions within genetic sequences. + + See more about the discussion [here](https://discourse.julialang.org/t/how-to-improve-a-generator-to-be-more-memory-efficient-when-it-is-collected/92932/8?u=camilogarciabotero) + +""" +function _locationiterator( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false +) where {N} + regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG(?:[N]{3})*?T(AG|AA|GA)"dna + # regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG([N]{3})*T(AG|AA|GA)?"dna # an attempt to make it non PCRE non-determinsitic + finder(x) = findfirst(regorf, sequence, first(x) + 1) # + 3 + itr = takewhile(!isnothing, iterated(finder, findfirst(regorf, sequence))) + return itr +end + +""" +naivefinder(sequence::NucleicAlphabet{DNAAlphabet{N}}; alternative_start::Bool=false, min_len::Int64=6)::Vector{ORF} where {N} + +A simple implementation that finds ORFs in a DNA sequence. + +The `naivefinder` function takes a LongSequence{DNAAlphabet{4}} sequence and returns a Vector{ORF} containing the ORFs found in the sequence. + It searches entire regularly expressed CDS, adding each ORF it finds to the vector. The function also searches the reverse complement of the sequence, so it finds ORFs on both strands. + Extending the starting codons with the `alternative_start = true` will search for ATG, GTG, and TTG. + Some studies have shown that in *E. coli* (K-12 strain), ATG, GTG and TTG are used 83 %, 14 % and 3 % respectively. +!!! note + This function has not ORFs scoring scheme. Thus it might consider `aa"M*"` a posible encoding protein from the resulting ORFs. + +# Keywords + +- `alternative_start::Bool=false`: If true will pass the extended start codons to search. This will increase 3x the exec. time. +- `min_len::Int64=6`: Length of the allowed ORF. Default value allow `aa"M*"` a posible encoding protein from the resulting ORFs. +""" +function naivefinder( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + alternative_start::Bool = false, + min_len::Int64 = 6 +) where {N} + orfs = Vector{ORF}(undef, 0) + reversedseq = reverse_complement(sequence) + seqlen = length(sequence) + framedict = Dict(0 => 3, 1 => 1, 2 => 2) + + for strand in ('+', '-') + seq = strand == '-' ? reversedseq : sequence + + @inbounds for location in @views _locationiterator(seq; alternative_start) + if length(location) >= min_len + frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] + push!(orfs, ORF(strand == '+' ? location : (seqlen - location.stop + 1):(seqlen - location.start + 1), strand, frame, 0.0)) + end + end + end + return sort(orfs) +end + + +## Another alternative: + +# function _locationiterator( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false +# ) where {N} +# regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG(?:[N]{3})*?T(AG|AA|GA)"dna +# finder(x) = findfirst(regorf, sequence, first(x) + 1) # + 3 +# itr = takewhile(!isnothing, iterated(finder, findfirst(regorf, sequence))) +# return itr +# end + +# function naivefinder( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {N} +# orfs = Vector{ORF}(undef, 0) +# reversedseq = reverse_complement(sequence) +# seqlen = length(sequence) +# framedict = Dict(0 => 3, 1 => 1, 2 => 2) + +# for strand in ('+', '-') +# seq = strand == '-' ? reversedseq : sequence + +# @inbounds for location in @views _locationiterator(seq; alternative_start) +# if length(location) >= min_len +# frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] +# push!(orfs, ORF(strand == '+' ? location : (seqlen - location.stop + 1):(seqlen - location.start + 1), strand, frame, 0.0)) +# end +# end +# end +# return sort(orfs) +# end + +## Ideas to genefinder (this is the way window function are implemented in the DSP.jl package) + +# function make_orf_finder(finderfunc::Function, sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} +# ... +# end + +# function locationiterator(sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} +# make_orf_finder(sequence::NucleicSeqOrView{DNAAlphabet{N}}) do x +# ... +# end +# end + +# function findorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}; finder::Function=locationiterator) where {N} +# ... +# end \ No newline at end of file diff --git a/src/findorfs.jl b/src/findorfs.jl new file mode 100644 index 0000000..f22f5cc --- /dev/null +++ b/src/findorfs.jl @@ -0,0 +1,26 @@ +export findorfs + +""" + findorfs(sequence::NucleicSeqOrView{DNAAlphabet{N}}; findermethod=naivefinder, alternative_start=false, min_len=6) where {N} + +This is the main interface method for finding open reading frames (ORFs) in a DNA sequence. + + It takes the following arguments: +- `sequence`: The nucleic acid sequence to search for ORFs. +- `findermethod`: The algorithm used to find ORFs. Default is `naivefinder`. +- `alternative_start`: A boolean indicating whether to consider alternative start codons. Default is `false`. +- `min_len`: The minimum length of an ORF. Default is `6`. + +Returns a vector of `ORF` objects representing the found ORFs. +""" +function findorfs( + sequence::NucleicSeqOrView{DNAAlphabet{N}}; + findermethod::Function = naivefinder, + alternative_start::Bool = false, + min_len::Int64 = 6 + ) where {N} + + orfs = findermethod(sequence; alternative_start, min_len)::Vector{ORF} + + return orfs +end \ No newline at end of file From 3f0246e12907840d936eeaf12851bb5d5c597056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 13:03:38 -0500 Subject: [PATCH 05/17] Update exports to file level --- src/GeneFinder.jl | 12 +++--------- src/io.jl | 2 ++ src/types.jl | 3 +++ src/utils.jl | 1 + 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 4d0304d..01959a5 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -24,18 +24,12 @@ using FASTX: FASTAReader, FASTARecord, description, sequence using IterTools: takewhile, iterated using PrecompileTools: @setup_workload, @compile_workload +include("algorithms/naivefinder.jl") include("types.jl") -export ORF - -include("algorithms/findorfs.jl") -export locationiterator, findorfs, get_orfs_dna, get_orfs_aa, record_orfs_fna, record_orfs_faa - +include("findorfs.jl") +include("getorfs.jl") include("io.jl") -export write_orfs_fna, write_orfs_faa, write_orfs_bed, write_orfs_gff - include("utils.jl") -export fasta_to_dna, hasprematurestop - include("extended.jl") @setup_workload begin diff --git a/src/io.jl b/src/io.jl index e615b13..b5432c3 100644 --- a/src/io.jl +++ b/src/io.jl @@ -1,3 +1,5 @@ +export write_orfs_fna, write_orfs_faa, write_orfs_bed, write_orfs_gff + """ write_orfs_bed(input::NucleicAcidAlphabet{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} write_orfs_bed(input::NucleicAcidAlphabet{DNAAlphabet{N}}, output::String; kwargs...) where {N} diff --git a/src/types.jl b/src/types.jl index 70c7e57..bda70f6 100644 --- a/src/types.jl +++ b/src/types.jl @@ -1,3 +1,5 @@ +export ORF + # Structs associated with gene models abstract type AbstractGene end @@ -48,6 +50,7 @@ The ORF struct represents an open reading frame in a DNA sequence. It has two fi - `score`: is a Union{Nothing, Float64} type indicating the score of the ORF. It can be a Float64 or nothing. """ struct ORF <: AbstractGene + #TODOs: location might be a complex Union allowing UnitRange or a Join of ranges (e.g. 1..100, 200..300)? location::UnitRange{Int64} # Note that it is also called position for gene struct in GenomicAnotations strand::Char frame::Int # Use Int64 instead of Int diff --git a/src/utils.jl b/src/utils.jl index c0d9d39..ea57668 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,3 +1,4 @@ +export fasta_to_dna, hasprematurestop # General purposes methods supporting main functions """ fasta_to_dna(input::String) From be321ca2280e120b3079470aea20967fe3b40687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 13:17:44 -0500 Subject: [PATCH 06/17] Update ORF outputs in docs and README --- README.md | 24 ++++++++++++------------ docs/src/simplefinder.md | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index fe1a30a..7b0eddb 100644 --- a/README.md +++ b/README.md @@ -53,18 +53,18 @@ Now lest us find the ORFs findorfs(seq) 12-element Vector{ORF}: - ORF(29:40, '+', 2) - ORF(137:145, '+', 2) - ORF(164:184, '+', 2) - ORF(173:184, '+', 2) - ORF(236:241, '+', 2) - ORF(248:268, '+', 2) - ORF(362:373, '+', 2) - ORF(470:496, '+', 2) - ORF(551:574, '+', 2) - ORF(569:574, '+', 2) - ORF(581:601, '+', 2) - ORF(695:706, '+', 2) + ORF(29:40, '+', 2, 0.0) + ORF(137:145, '+', 2, 0.0) + ORF(164:184, '+', 2, 0.0) + ORF(173:184, '+', 2, 0.0) + ORF(236:241, '+', 2, 0.0) + ORF(248:268, '+', 2, 0.0) + ORF(362:373, '+', 2, 0.0) + ORF(470:496, '+', 2, 0.0) + ORF(551:574, '+', 2, 0.0) + ORF(569:574, '+', 2, 0.0) + ORF(581:601, '+', 2, 0.0) + ORF(695:706, '+', 2, 0.0) ``` Two other functions (`get_orfs_dna` and `get_orfs_aa`) are implemented to get the ORFs in DNA and amino acid sequences, respectively. They use the `findorfs` function to first get the ORFs and then get the correspondance array of `BioSequence` objects. diff --git a/docs/src/simplefinder.md b/docs/src/simplefinder.md index faac22a..05ee696 100644 --- a/docs/src/simplefinder.md +++ b/docs/src/simplefinder.md @@ -15,18 +15,18 @@ Now lest us find the ORFs findorfs(seq) 12-element Vector{ORF}: - ORF(29:40, '+', 2) - ORF(137:145, '+', 2) - ORF(164:184, '+', 2) - ORF(173:184, '+', 2) - ORF(236:241, '+', 2) - ORF(248:268, '+', 2) - ORF(362:373, '+', 2) - ORF(470:496, '+', 2) - ORF(551:574, '+', 2) - ORF(569:574, '+', 2) - ORF(581:601, '+', 2) - ORF(695:706, '+', 2) + ORF(29:40, '+', 2, 0.0) + ORF(137:145, '+', 2, 0.0) + ORF(164:184, '+', 2, 0.0) + ORF(173:184, '+', 2, 0.0) + ORF(236:241, '+', 2, 0.0) + ORF(248:268, '+', 2, 0.0) + ORF(362:373, '+', 2, 0.0) + ORF(470:496, '+', 2, 0.0) + ORF(551:574, '+', 2, 0.0) + ORF(569:574, '+', 2, 0.0) + ORF(581:601, '+', 2, 0.0) + ORF(695:706, '+', 2, 0.0) ``` Two other functions (`get_orfs_dna` and `get_orfs_aa`) are implemented to get the ORFs in DNA and amino acid sequences, respectively. They use the `findorfs` function to first get the ORFs and then get the correspondance array of `BioSequence` objects. From b2080ab3a862cb33942422dca2f257f392a8c56f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 16:28:27 -0500 Subject: [PATCH 07/17] Add todos in getindex --- src/extended.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/extended.jl b/src/extended.jl index 8fb8854..354fca3 100644 --- a/src/extended.jl +++ b/src/extended.jl @@ -2,6 +2,7 @@ import Base: length, iterate, sort, getindex Base.sort(v::Vector{<:ORF}) = sort(v, by = _orf_sort_key) +#TODOs: how to make more robust the getindex method? confroning the frames? Base.getindex(sequence::NucleicSeqOrView{A}, orf::ORF) where {A} = orf.strand == '+' ? (@view sequence[orf.location]) : reverse_complement(@view sequence[orf.location]) function _orf_sort_key(orf::ORF) From 963764af931bdb11e0b7dec40e35d2862b061b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 16:28:34 -0500 Subject: [PATCH 08/17] Refactor naivefinder function and include getindextest.jl --- src/algorithms/naivefinder.jl | 27 +++++++++------------------ src/findorfs.jl | 3 +-- test/getindextest.jl | 11 +++++++++++ test/runtests.jl | 1 + 4 files changed, 22 insertions(+), 20 deletions(-) create mode 100644 test/getindextest.jl diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 0e94d9f..69b4951 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -67,43 +67,34 @@ function naivefinder( return sort(orfs) end - -## Another alternative: - -# function _locationiterator( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# alternative_start::Bool = false -# ) where {N} -# regorf = alternative_start ? biore"DTG(?:[N]{3})*?T(AG|AA|GA)"dna : biore"ATG(?:[N]{3})*?T(AG|AA|GA)"dna -# finder(x) = findfirst(regorf, sequence, first(x) + 1) # + 3 -# itr = takewhile(!isnothing, iterated(finder, findfirst(regorf, sequence))) -# return itr -# end - # function naivefinder( # sequence::NucleicSeqOrView{DNAAlphabet{N}}; # alternative_start::Bool = false, # min_len::Int64 = 6 # ) where {N} -# orfs = Vector{ORF}(undef, 0) -# reversedseq = reverse_complement(sequence) # seqlen = length(sequence) # framedict = Dict(0 => 3, 1 => 1, 2 => 2) +# orfs = Vector{ORF}() # for strand in ('+', '-') -# seq = strand == '-' ? reversedseq : sequence +# seq = strand == '-' ? reverse_complement(sequence) : sequence # @inbounds for location in @views _locationiterator(seq; alternative_start) # if length(location) >= min_len # frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] -# push!(orfs, ORF(strand == '+' ? location : (seqlen - location.stop + 1):(seqlen - location.start + 1), strand, frame, 0.0)) +# start = strand == '+' ? location.start : seqlen - location.stop + 1 +# stop = start + length(location) - 1 +# push!(orfs, ORF(start:stop, strand, frame, 0.0)) # end # end # end + # return sort(orfs) # end -## Ideas to genefinder (this is the way window function are implemented in the DSP.jl package) +## Another alternative: + +## This is the way window function are implemented in the DSP.jl package # function make_orf_finder(finderfunc::Function, sequence::NucleicSeqOrView{DNAAlphabet{N}}) where {N} # ... diff --git a/src/findorfs.jl b/src/findorfs.jl index f22f5cc..1b5fedc 100644 --- a/src/findorfs.jl +++ b/src/findorfs.jl @@ -20,7 +20,6 @@ function findorfs( min_len::Int64 = 6 ) where {N} - orfs = findermethod(sequence; alternative_start, min_len)::Vector{ORF} + return findermethod(sequence; alternative_start, min_len)#::Vector{ORF} - return orfs end \ No newline at end of file diff --git a/test/getindextest.jl b/test/getindextest.jl new file mode 100644 index 0000000..e961b89 --- /dev/null +++ b/test/getindextest.jl @@ -0,0 +1,11 @@ +# Import the function to be tested + +# Run the tests +@testset "getindex" begin + seq01 = dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAGCTAGCTAGTAA" + + orfs01 = [ORF(1:33, '+', 1, 0.0), ORF(4:33, '+', 1, 0.0), ORF(8:22, '+', 2, 0.0), ORF(12:29, '+', 3, 0.0), ORF(16:33, '+', 1, 0.0)] + + @test getindex(seq01, orfs01[1]) == dna"ATGATGCATGCATGCATGCTAGTAACTAGCTAG" + @test getindex(seq01, orfs01[2]) == dna"ATGCATGCATGCATGCTAGTAACTAGCTAG" +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index a322c3b..d89a385 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,6 +8,7 @@ using Aqua include("findorfstest.jl") include("iotest.jl") +include("getindextest.jl") include("aquatest.jl") end From 91761379511f97b6a4d0c83a2da1177ec6ff279c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 16:31:49 -0500 Subject: [PATCH 09/17] Refactor naivefinder function to improve readability and performance --- src/algorithms/naivefinder.jl | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/src/algorithms/naivefinder.jl b/src/algorithms/naivefinder.jl index 69b4951..2b0c93a 100644 --- a/src/algorithms/naivefinder.jl +++ b/src/algorithms/naivefinder.jl @@ -49,49 +49,25 @@ function naivefinder( alternative_start::Bool = false, min_len::Int64 = 6 ) where {N} - orfs = Vector{ORF}(undef, 0) - reversedseq = reverse_complement(sequence) seqlen = length(sequence) framedict = Dict(0 => 3, 1 => 1, 2 => 2) + orfs = Vector{ORF}() for strand in ('+', '-') - seq = strand == '-' ? reversedseq : sequence + seq = strand == '-' ? reverse_complement(sequence) : sequence @inbounds for location in @views _locationiterator(seq; alternative_start) if length(location) >= min_len frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] - push!(orfs, ORF(strand == '+' ? location : (seqlen - location.stop + 1):(seqlen - location.start + 1), strand, frame, 0.0)) + start = strand == '+' ? location.start : seqlen - location.stop + 1 + stop = start + length(location) - 1 + push!(orfs, ORF(start:stop, strand, frame, 0.0)) end end end return sort(orfs) end -# function naivefinder( -# sequence::NucleicSeqOrView{DNAAlphabet{N}}; -# alternative_start::Bool = false, -# min_len::Int64 = 6 -# ) where {N} -# seqlen = length(sequence) -# framedict = Dict(0 => 3, 1 => 1, 2 => 2) -# orfs = Vector{ORF}() - -# for strand in ('+', '-') -# seq = strand == '-' ? reverse_complement(sequence) : sequence - -# @inbounds for location in @views _locationiterator(seq; alternative_start) -# if length(location) >= min_len -# frame = strand == '+' ? framedict[location.start % 3] : framedict[(seqlen - location.stop + 1) % 3] -# start = strand == '+' ? location.start : seqlen - location.stop + 1 -# stop = start + length(location) - 1 -# push!(orfs, ORF(start:stop, strand, frame, 0.0)) -# end -# end -# end - -# return sort(orfs) -# end - ## Another alternative: ## This is the way window function are implemented in the DSP.jl package From 9dea1ce2395089e89aa65439309e9c3ec63e9f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 16:32:17 -0500 Subject: [PATCH 10/17] Fix return type annotation in findorfs function --- src/findorfs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/findorfs.jl b/src/findorfs.jl index 1b5fedc..c18b357 100644 --- a/src/findorfs.jl +++ b/src/findorfs.jl @@ -20,6 +20,6 @@ function findorfs( min_len::Int64 = 6 ) where {N} - return findermethod(sequence; alternative_start, min_len)#::Vector{ORF} + return findermethod(sequence; alternative_start, min_len)::Vector{ORF} end \ No newline at end of file From ab25b99c7de89d07b52d620dcc59e1e0cec1f722 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 17:51:42 -0500 Subject: [PATCH 11/17] Update .gitignore and add test Project.toml --- .gitignore | 3 ++- test/Project.toml | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 test/Project.toml diff --git a/.gitignore b/.gitignore index 9dd02c5..2c2e953 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ /docs/build/ .travis.yml /docs/src/*_cache -/docs/src/*_files \ No newline at end of file +/docs/src/*_files +/test/Manifest.toml \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..60aa073 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,5 @@ +[deps] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" +FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" From 78434d0a6c82ea2cb6e3f8f885dc90da3ead4130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 17:52:27 -0500 Subject: [PATCH 12/17] Remove FASTX methods and make it lighter --- Manifest.toml | 41 +-------------------------- Project.toml | 2 -- src/GeneFinder.jl | 2 +- src/getorfs.jl | 72 +++++++++++++++++++++++------------------------ src/utils.jl | 46 +++++++++++++++--------------- test/runtests.jl | 11 ++++++++ 6 files changed, 72 insertions(+), 102 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 371ac23..00b6976 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,19 +2,7 @@ julia_version = "1.10.2" manifest_format = "2.0" -project_hash = "4f6774eaeaec568c8cb7a865a84c0c7d9a180727" - -[[deps.Automa]] -deps = ["PrecompileTools", "TranscodingStreams"] -git-tree-sha1 = "588e0d680ad1d7201d4c6a804dcb1cd9cba79fbb" -uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" -version = "1.0.3" - -[[deps.BioGenerics]] -deps = ["TranscodingStreams"] -git-tree-sha1 = "7bbc085aebc6faa615740b63756e4986c9e85a70" -uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea" -version = "0.1.4" +project_hash = "0426921543f1f35e1faa99ad2688f4a61508977b" [[deps.BioSequences]] deps = ["BioSymbols", "PrecompileTools", "Random", "Twiddle"] @@ -32,16 +20,6 @@ version = "5.1.3" deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[deps.FASTX]] -deps = ["Automa", "BioGenerics", "PrecompileTools", "StringViews", "TranscodingStreams"] -git-tree-sha1 = "bff5d62bf5e1c382a370ac701bcaea9a24115ac6" -uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12" -version = "2.1.4" -weakdeps = ["BioSequences"] - - [deps.FASTX.extensions] - BioSequencesExt = "BioSequences" - [[deps.IterTools]] git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023" uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" @@ -71,28 +49,11 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" version = "0.7.0" -[[deps.StringViews]] -git-tree-sha1 = "f7b06677eae2571c888fd686ba88047d8738b0e3" -uuid = "354b36f9-a18e-4713-926e-db85100087ba" -version = "1.3.3" - [[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" version = "1.0.3" -[[deps.TranscodingStreams]] -git-tree-sha1 = "71509f04d045ec714c4748c785a59045c3736349" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.7" - - [deps.TranscodingStreams.extensions] - TestExt = ["Test", "Random"] - - [deps.TranscodingStreams.weakdeps] - Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - [[deps.Twiddle]] git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a" uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795" diff --git a/Project.toml b/Project.toml index 093d413..9dec4f7 100644 --- a/Project.toml +++ b/Project.toml @@ -5,13 +5,11 @@ version = "0.2.0" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" -FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" [compat] BioSequences = "3" -FASTX = "2" IterTools = "1.4" PrecompileTools = "1" julia = "1" diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 01959a5..fbbbd74 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -20,7 +20,7 @@ using BioSequences: ncbi_trans_table, translate -using FASTX: FASTAReader, FASTARecord, description, sequence +# using FASTX: FASTAReader, FASTARecord, description, sequence using IterTools: takewhile, iterated using PrecompileTools: @setup_workload, @compile_workload diff --git a/src/getorfs.jl b/src/getorfs.jl index 46887c2..fed466b 100644 --- a/src/getorfs.jl +++ b/src/getorfs.jl @@ -1,4 +1,4 @@ -export get_orfs_dna, get_orfs_aa, record_orfs_fna, record_orfs_faa +export get_orfs_dna, get_orfs_aa #### get_orfs_* methods #### @@ -77,23 +77,23 @@ An array of `FASTARecord` objects representing the identified ORFs. # Description This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument. """ -function record_orfs_fna( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - records = FASTARecord[] - @inbounds for (index, orf) in enumerate(orfs) - id = string(lpad(string(index), padding, "0")) - header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" - record = FASTARecord(header, sequence[orf]) - push!(records, record) - end - return records -end +# function record_orfs_fna( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {N} +# orfs = findorfs(sequence; alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# records = FASTARecord[] +# @inbounds for (index, orf) in enumerate(orfs) +# id = string(lpad(string(index), padding, "0")) +# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" +# record = FASTARecord(header, sequence[orf]) +# push!(records, record) +# end +# return records +# end """ record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} @@ -111,21 +111,21 @@ The function returns a list of FASTA records, where each record represents an OR # Returns - A list of FASTA records representing the ORFs found in the sequence. """ -function record_orfs_faa( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - records = FASTARecord[] - @inbounds for (index, orf) in enumerate(orfs) - id = string(lpad(string(index), padding, "0")) - header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" - record = FASTARecord(header, translate(sequence[orf]; code)) - push!(records, record) - end - return records -end \ No newline at end of file +# function record_orfs_faa( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# code::GeneticCode = ncbi_trans_table[1], +# min_len::Int64 = 6 +# ) where {N} +# orfs = findorfs(sequence; alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# records = FASTARecord[] +# @inbounds for (index, orf) in enumerate(orfs) +# id = string(lpad(string(index), padding, "0")) +# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" +# record = FASTARecord(header, translate(sequence[orf]; code)) +# push!(records, record) +# end +# return records +# end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index ea57668..2571723 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,15 +1,15 @@ -export fasta_to_dna, hasprematurestop +# export fasta_to_dna, hasprematurestop # General purposes methods supporting main functions -""" - fasta_to_dna(input::String) - -Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. -""" -function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} - FASTAReader(open(input)) do reader - return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] - end -end +# """ +# fasta_to_dna(input::String) + +# Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. +# """ +# function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} +# FASTAReader(open(input)) do reader +# return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] +# end +# end # function gff_to_dna(input::AbstractString) # GFF3.Reader(open(input)) do reader @@ -24,23 +24,23 @@ Determine whether the `sequence` of type `LongSequence{DNAAlphabet{4}}` contains Returns a boolean indicating whether the `sequence` has more than one stop codon. """ -function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N} +# function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N} - stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Create a set of stop codons +# stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Create a set of stop codons - length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") +# length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") - occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence") +# occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence") - @inbounds for i in 1:3:length(sequence) - 4 - codon = sequence[i:i+2] - if codon in stopcodons - return true - end - end +# @inbounds for i in 1:3:length(sequence) - 4 +# codon = sequence[i:i+2] +# if codon in stopcodons +# return true +# end +# end - return false -end +# return false +# end @doc raw""" iscoding( diff --git a/test/runtests.jl b/test/runtests.jl index d89a385..899fe24 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,17 @@ using FASTX using GeneFinder using Aqua +""" + fasta_to_dna(input::String) + +Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. +""" +function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} + FASTAReader(open(input)) do reader + return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] + end +end + include("findorfstest.jl") include("iotest.jl") include("getindextest.jl") From 065c61d4c12356ec8d567f587360206a5b4bddf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 17:55:25 -0500 Subject: [PATCH 13/17] Reduce precompilation workload --- src/GeneFinder.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index fbbbd74..dc08350 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -40,8 +40,6 @@ include("extended.jl") # all calls in this block will be precompiled, regardless of whether # they belong to your package or not (on Julia 1.8 and higher) findorfs(seq) - get_orfs_dna(seq) - get_orfs_aa(seq) end end From 2fdae66c47d1a9ef9457981bdf80ed2e02fde5e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 18:07:08 -0500 Subject: [PATCH 14/17] Remove commented out code and unused function --- src/utils.jl | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 2571723..06f1a49 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,21 +1,5 @@ # export fasta_to_dna, hasprematurestop # General purposes methods supporting main functions -# """ -# fasta_to_dna(input::String) - -# Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. -# """ -# function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} -# FASTAReader(open(input)) do reader -# return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] -# end -# end - -# function gff_to_dna(input::AbstractString) -# GFF3.Reader(open(input)) do reader -# return [record for record in reader] -# end -# end """ hasprematurestop(sequence::LongNucOrView{4})::Bool From 2b5c8fc085b646177309bea673ccff9e83a47b66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 18:07:48 -0500 Subject: [PATCH 15/17] Remove unused function from utils.jl --- src/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.jl b/src/utils.jl index 06f1a49..a61831e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,4 @@ -# export fasta_to_dna, hasprematurestop +# export hasprematurestop # General purposes methods supporting main functions """ From 80a8918ba810bce2c1d6a74876b1fdd50aba4299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 18:52:49 -0500 Subject: [PATCH 16/17] Fix score field declaration in Gene and ORF structs --- src/types.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/types.jl b/src/types.jl index bda70f6..12d9c7e 100644 --- a/src/types.jl +++ b/src/types.jl @@ -10,7 +10,7 @@ abstract type AbstractGene end stop::Int64 strand::Char frame::Int8 - score::Union{Float64, Nothing} # Add score field + score::Union{Nothing, Float64} # Add score field attribute::Dict end @@ -39,7 +39,7 @@ This is the main Gene struct, based on the fields that could be found in a GFF3, location::UnitRange{Int64} strand::Char frame::Int - score::Union{Float64, Nothing} # Add score field + score::Union{Nothing, Float64} end The ORF struct represents an open reading frame in a DNA sequence. It has two fields: From 92b0e22f64154170f0f262df48e17ca761f5b1ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 18:53:03 -0500 Subject: [PATCH 17/17] Add findermethod parameter to get_orfs_dna and get_orfs_aa functions --- src/getorfs.jl | 6 +- src/io.jl | 308 ++++++++++++++++++++++++++----------------------- 2 files changed, 165 insertions(+), 149 deletions(-) diff --git a/src/getorfs.jl b/src/getorfs.jl index fed466b..bb2aa19 100644 --- a/src/getorfs.jl +++ b/src/getorfs.jl @@ -19,10 +19,11 @@ This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies """ function get_orfs_dna( sequence::NucleicSeqOrView{DNAAlphabet{N}}; + findermethod::Function = naivefinder, alternative_start::Bool = false, min_len::Int64 = 6 ) where {N} - orfs = findorfs(sequence; alternative_start, min_len) + orfs = findorfs(sequence; findermethod, alternative_start, min_len) seqs = Vector{LongSubSeq{DNAAlphabet{N}}}(undef, length(orfs)) #Vector{}(undef, length(orfs)) # todo correct the output type @inbounds for (i, orf) in enumerate(orfs) seqs[i] = sequence[orf] @@ -47,11 +48,12 @@ This function takes a `NucleicSeqOrView{DNAAlphabet{N}}` sequence and identifies """ function get_orfs_aa( sequence::NucleicSeqOrView{DNAAlphabet{N}}; + findermethod::Function = naivefinder, alternative_start::Bool = false, code::GeneticCode = ncbi_trans_table[1], min_len::Int64 = 6 ) where {N} - orfs = findorfs(sequence; alternative_start, min_len) + orfs = findorfs(sequence; findermethod, alternative_start, min_len) aas = Vector{LongSubSeq{AminoAcidAlphabet}}(undef, length(orfs)) @inbounds for (i, orf) in enumerate(orfs) aas[i] = translate(sequence[orf]; code) diff --git a/src/io.jl b/src/io.jl index b5432c3..7240ffb 100644 --- a/src/io.jl +++ b/src/io.jl @@ -1,8 +1,8 @@ export write_orfs_fna, write_orfs_faa, write_orfs_bed, write_orfs_gff """ - write_orfs_bed(input::NucleicAcidAlphabet{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} - write_orfs_bed(input::NucleicAcidAlphabet{DNAAlphabet{N}}, output::String; kwargs...) where {N} + write_orfs_bed(input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} + write_orfs_bed(input::NucleicSeqOrView{DNAAlphabet{N}}, output::String; kwargs...) where {N} write_orfs_bed(input::String, output::Union{IOStream, IOBuffer}; kwargs...) write_orfs_bed(input::String, output::String; kwargs...) @@ -40,33 +40,35 @@ function write_orfs_bed( end end -function write_orfs_bed( - input::String, - output::Union{IOStream, IOBuffer}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] # rewrite input to be a DNA sequence - orfs = findorfs(input; alternative_start, min_len) - @inbounds for orf in orfs - println(output, orf.location.start, "\t", orf.location.stop, "\t", orf.strand, "\t", orf.frame) - end -end - -function write_orfs_bed( - input::String, - output::String; - alternative_start::Bool = false, - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - open(output, "w") do f - @inbounds for orf in orfs - write(f, "$(orf.location.start)\t$(orf.location.stop)\t$(orf.strand)\t$(orf.frame)\n") - end - end -end +# function write_orfs_bed( +# input::String, +# output::Union{IOStream, IOBuffer}; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] # rewrite input to be a DNA sequence +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# @inbounds for orf in orfs +# println(output, orf.location.start, "\t", orf.location.stop, "\t", orf.strand, "\t", orf.frame) +# end +# end + +# function write_orfs_bed( +# input::String, +# output::String; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# open(output, "w") do f +# @inbounds for orf in orfs +# write(f, "$(orf.location.start)\t$(orf.location.stop)\t$(orf.strand)\t$(orf.frame)\n") +# end +# end +# end """ write_orfs_fna(input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} @@ -95,10 +97,11 @@ end function write_orfs_fna( input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; + findermethod::Function = naivefinder, alternative_start::Bool = false, min_len::Int64 = 6 ) where {N} - orfs = findorfs(input; alternative_start, min_len) + orfs = findorfs(input; findermethod, alternative_start, min_len) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) @@ -110,11 +113,12 @@ end function write_orfs_fna( input::NucleicSeqOrView{DNAAlphabet{N}}, - output::String; + output::String; + findermethod::Function = naivefinder, alternative_start::Bool = false, min_len::Int64 = 6 ) where {N} - orfs = findorfs(input; alternative_start, min_len) + orfs = findorfs(input; findermethod, alternative_start, min_len) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) open(output, "w") do f @@ -125,44 +129,46 @@ function write_orfs_fna( end end -function write_orfs_fna( - input::String, - output::Union{IOStream, IOBuffer}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - @inbounds for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) - println(output, input[orf]) - end -end - -function write_orfs_fna( - input::String, - output::String; - alternative_start::Bool = false, - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - open(output, "w") do f - for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) - end - end -end +# function write_orfs_fna( +# input::String, +# output::Union{IOStream, IOBuffer}; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# @inbounds for (i, orf) in enumerate(orfs) +# id = string(lpad(string(i), padding, "0")) +# println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) +# println(output, input[orf]) +# end +# end + +# function write_orfs_fna( +# input::String, +# output::String; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# open(output, "w") do f +# for (i, orf) in enumerate(orfs) +# id = string(lpad(string(i), padding, "0")) +# write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(input[orf])\n") # i.strand == '+' ? input[i.location] : reverse_complement(@view input[i.location]) +# end +# end +# end """ - write_orfs_faa(input::LongSequence{DNAAlphabet{4}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} - write_orfs_faa(input::LongSequence{DNAAlphabet{4}}, output::String; kwargs...) where {N} + write_orfs_faa(input::NucleicSeqOrView{DNAAlphabet{4}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} + write_orfs_faa(input::NucleicSeqOrView{DNAAlphabet{4}}, output::String; kwargs...) where {N} write_orfs_faa(input::String, output::Union{IOStream, IOBuffer}; kwargs...) write_orfs_faa(input::String, output::String; kwargs...) @@ -188,11 +194,12 @@ end function write_orfs_faa( input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; + findermethod::Function = naivefinder, alternative_start::Bool = false, code::GeneticCode = ncbi_trans_table[1], min_len::Int64 = 6 ) where {N} - orfs = findorfs(input; alternative_start, min_len) + orfs = findorfs(input; findermethod, alternative_start, min_len) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) @inbounds for (i, orf) in enumerate(orfs) @@ -205,11 +212,12 @@ end function write_orfs_faa( input::NucleicSeqOrView{DNAAlphabet{N}}, output::String; + findermethod::Function = naivefinder, alternative_start::Bool = false, code::GeneticCode = ncbi_trans_table[1], min_len::Int64 = 6, ) where {N} - orfs = findorfs(input; alternative_start, min_len) + orfs = findorfs(input; findermethod, alternative_start, min_len) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) open(output, "w") do f @@ -220,42 +228,44 @@ function write_orfs_faa( end end -function write_orfs_faa( - input::String, - output::Union{IOStream, IOBuffer}; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - @inbounds for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) - println(output, translate(input[orf]; code)) - end -end - -function write_orfs_faa( - input::String, - output::String; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6, -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - open(output, "w") do f - for (i, orf) in enumerate(orfs) - id = string(lpad(string(i), padding, "0")) - write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") - end - end -end +# function write_orfs_faa( +# input::String, +# output::Union{IOStream, IOBuffer}; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# code::GeneticCode = ncbi_trans_table[1], +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# @inbounds for (i, orf) in enumerate(orfs) +# id = string(lpad(string(i), padding, "0")) +# println(output, ">ORF", id, " id=", id, " start=", orf.location.start, " stop=", orf.location.stop, " strand=", orf.strand, " frame=", orf.frame) +# println(output, translate(input[orf]; code)) +# end +# end + +# function write_orfs_faa( +# input::String, +# output::String; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# code::GeneticCode = ncbi_trans_table[1], +# min_len::Int64 = 6, +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# open(output, "w") do f +# for (i, orf) in enumerate(orfs) +# id = string(lpad(string(i), padding, "0")) +# write(f, ">ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)\n$(translate(input[orf]; code))\n") +# end +# end +# end """ write_orfs_gff(input::NucleicSeqOrView{DNAAlphabet{N}}, output::Union{IOStream, IOBuffer}; kwargs...) where {N} @@ -274,11 +284,12 @@ Write GFF data to a file. """ function write_orfs_gff( input::NucleicSeqOrView{A}, - output::Union{IOStream, IOBuffer}; + output::Union{IOStream, IOBuffer}; + findermethod::Function = naivefinder, alternative_start::Bool = false, min_len::Int64 = 6 ) where {A} - orfs = findorfs(input; alternative_start, min_len) + orfs = findorfs(input; findermethod, alternative_start, min_len) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") @@ -291,10 +302,11 @@ end function write_orfs_gff( input::NucleicSeqOrView{A}, output::String; + findermethod::Function = naivefinder, alternative_start::Bool = false, min_len::Int64 = 6 ) where {A} - orfs = findorfs(input; alternative_start, min_len) + orfs = findorfs(input; findermethod, alternative_start, min_len) norfs = length(orfs) padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) open(output, "w") do f @@ -309,43 +321,45 @@ function write_orfs_gff( end end -function write_orfs_gff( - input::String, - output::Union{IOStream, IOBuffer}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") - for (i, orf) in enumerate(orfs) - id = string("ORF", lpad(string(i), padding, "0")) - println(output, "Chr\t.\tORF\t", orf.location.start, "\t", orf.location.stop, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) - end -end - -function write_orfs_gff( - input::String, - output::String; - alternative_start::Bool = false, - min_len::Int64 = 6 -) - input = fasta_to_dna(input)[1] - orfs = findorfs(input; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - open(output, "w") do f - write(f, "##gff-version 3\n##sequence-region Chr 1 $(length(input))\n") - for (i, orf) in enumerate(orfs) - id = string("ORF", lpad(string(i), padding, "0")) - write( - f, - "Chr\t.\tORF\t$(orf.location.start)\t$(orf.location.stop)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", - ) - end - end -end +# function write_orfs_gff( +# input::String, +# output::Union{IOStream, IOBuffer}; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# println(output, "##gff-version 3\n##sequence-region Chr 1 $(length(input))") +# for (i, orf) in enumerate(orfs) +# id = string("ORF", lpad(string(i), padding, "0")) +# println(output, "Chr\t.\tORF\t", orf.location.start, "\t", orf.location.stop, "\t.\t", orf.strand, "\t.\tID=", id, ";Name=", id, ";Frame=", orf.frame) +# end +# end + +# function write_orfs_gff( +# input::String, +# output::String; +# findermethod::Function = naivefinder, +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) +# input = fasta_to_dna(input)[1] +# orfs = findorfs(input; findermethod, alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# open(output, "w") do f +# write(f, "##gff-version 3\n##sequence-region Chr 1 $(length(input))\n") +# for (i, orf) in enumerate(orfs) +# id = string("ORF", lpad(string(i), padding, "0")) +# write( +# f, +# "Chr\t.\tORF\t$(orf.location.start)\t$(orf.location.stop)\t.\t$(orf.strand)\t.\tID=$(id);Name=$(id);Frame=$(orf.frame)\n", +# ) +# end +# end +# end # Chr needs to be changed for fasta (?) or the name of seq (?) to fit well on IGV \ No newline at end of file