Skip to content

Commit

Permalink
Merge pull request #6 from camilogarciabotero:docs
Browse files Browse the repository at this point in the history
Update docs
  • Loading branch information
camilogarciabotero authored Feb 12, 2024
2 parents 0a953f0 + 3f9fd36 commit 9e60cd7
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 99 deletions.
12 changes: 9 additions & 3 deletions docs/src/vossrepresentation.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
## Voss representation

A Voss representation of a biological sequence is a binary matrix that encodes the sequence. The Voss representation of a sequence is obtained by encoding the sequence into a binary matrix where each column of the matrix represents a position in the sequence and each row represents a symbol in the alphabet (Voss, 1992). Formally, given a sequence ``S`` of length ``n`` and an alphabet ``\mathscr{A}`` of size ``m``, the Voss matrix of ``S`` is a ``m \times n`` binary matrix ``M`` such that ``M_{i,j} = 1`` if the ``j^{th}`` position of the sequence ``S`` contains the ``i^{th}`` symbol of the alphabet ``\mathscr{A}`` and ``M_{i,j} = 0`` otherwise.
A Voss representation of a biological sequence is a binary matrix that encodes the sequence: each column of the matrix represents a position in the sequence and each row represents a symbol of the alphabet (Voss, 1992). Formally, given a sequence ``S`` of length ``n`` and an alphabet ``\mathscr{A}`` of size ``m``, the Voss matrix ``V`` of ``S`` is an ``m \times n`` binary matrix such that ``V_{i,j} = 1`` if the ``j^{th}`` position of the sequence ``S`` is equal to the ``i^{th}`` symbol of the alphabet ``\mathscr{A}`` and ``V_{i,j} = 0`` otherwise:

```math
V_{i,j} = \begin{cases}
1 & \text{if } S[j] = \mathscr{A}[i] \\
0 & \text{if } S[j] \neq \mathscr{A}[i]
\end{cases}
```

For example, the Voss matrix of the sequence ``ACGT`` is the following matrix:
For example, the Voss matrix of the DNA sequence ``ACGT`` (i.e., over the alphabet ``\mathscr{A} = \{A, C, G, T\}``) is the following matrix:

```math
\begin{bmatrix}
Expand All @@ -14,7 +20,7 @@ For example, the Voss matrix of the sequence ``ACGT`` is the following matrix:
\end{bmatrix}
```

In this case the alphabet chosen is the DNA alphabet, but the same representation can be used for other alphabets.
In this case the given alphabet is the DNA alphabet, but the same representation can be used for other alphabets.

## Encoding BioSequences

Expand Down
136 changes: 40 additions & 96 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@ Converts a sequence of nucleotides into a binary representation.
# Returns
A `BitVector` representing the binary encoding of the input sequence, where 1 at the ith position indicates that the specified nucleotide is present at the ith position of the sequence and 0 indicates that it is absent.
# Examples
```julia
julia> vossvector(dna"ACGT", DNA_A)
4-element view(::BitMatrix, 1, :) with eltype Bool:
1
0
0
0
```
"""
function vossvector(sequence::NucleicSeqOrView{A}, molecule::T) where {A <: NucleicAcidAlphabet, T <: BioSymbol} # $dseq .=== DNA_A
@assert typeof(molecule) == eltype(sequence) "Input sequence and molecules must be of the same element type."
Expand Down Expand Up @@ -42,54 +54,6 @@ function vossvector(sequence::SeqOrView{A}, molecules::Tuple{Vararg{T}}) where {
return bv
end

# function vossvector(sequence::SeqOrView{A}, molecule::T) where {A <: AminoAcidAlphabet, T <: BioSymbol} # $dseq .=== DNA_A
# @assert typeof(molecule) == eltype(sequence) "Input sequence and molecules must be of the same element type."
# bm = BitMatrix(undef, 20, length(sequence))
# copy!(bm.chunks, sequence.data)
# if molecule in AA20
# return @view bm[findfirst(x -> x == molecule, AA20), :]
# else
# error("Unsupported molecule type.")
# end
# end

# function vossvector(sequence::SeqOrView{A}, molecule::T) where {A <: Alphabet, T <: BioSymbol} # $dseq .=== DNA_A
# @assert typeof(molecule) == eltype(sequence) "Input sequence and molecules must be of the same element type."
# seqlen = length(sequence)
# bv = BitVector(undef, seqlen)
# # acc = zero(UInt64)
# # bv .= (sequence .== molecule)
# @inbounds for i in 1:seqlen
# bv[i] = (sequence[i] == molecule)
# # acc = (acc << 1) | (sequence[i] == molecule)
# end
# return bv
# end

# function vossvector(s::SeqOrView{A}) where {A <: NucleicAcidAlphabet}
# M = BitMatrix(undef, 4, length(s))
# copy!(M.chunks, s.data)
# return M
# end

# function vossvector(s::SeqOrView{A}) where {A <: AminoAcidAlphabet}
# M = BitMatrix(undef, 20, length(s))
# copy!(M.chunks, s.data)
# return M
# end

# function vossvector(s::NucSeq)
# M = falses(4, length(s))
# for (i, s) in enumerate(s)
# bits = compatbits(s)
# while !iszero(bits)
# M[trailing_zeros(bits) + 1, i] = true
# bits &= bits - one(bits)
# end
# end
# M
# end

"""
vossmatrix(VossEncoder::VossEncoder{A, B}) where {A <: NucleicAcidAlphabet, B <: BitMatrix}
vossmatrix(sequence::NucleicSeqOrView{A}) where {A <: NucleicAcidAlphabet}
Expand All @@ -103,6 +67,33 @@ Create a binary sequence matrix from a given nucleic acid sequence.
# Returns
The binary sequence matrix.
# Examples
```julia
julia> vossmatrix(aa"IANRMWRDTIED")
20×12 BitMatrix:
0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 1
0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
```
"""
function vossmatrix(VE::VossEncoder{A, B}) where {A <: NucleicAcidAlphabet, B <: BitMatrix}
return VossEncoder(VE).bitmatrix
Expand All @@ -120,51 +111,4 @@ function vossmatrix(sequence::SeqOrView{AminoAcidAlphabet})
bm[i,:] = sequence .== AA20[i]
end
return bm
end

# function vossmatrix(sequence::SeqOrView{AminoAcidAlphabet})
# bm = BitMatrix(undef, 20, length(sequence))
# copy!(bm.chunks, sequence.data)
# return bm
# end

# function vossmatrix(sequence::SeqOrView{A}) where {A <: NucleicAcidAlphabet}
# seqtype = eltype(sequence)
# seqlen = length(sequence)
# VE = BitMatrix(undef, 4, seqlen)
# @inbounds for i in 1:seqlen
# nucleotide = sequence[i]
# VE[1, i] = (nucleotide == DNA_A)
# VE[2, i] = (nucleotide == DNA_C)
# VE[3, i] = (nucleotide == DNA_G)
# VE[4, i] = seqtype == DNA ? (nucleotide == DNA_T) : seqtype == RNA ? (nucleotide == RNA_U) : error("Unsupported sequence type")
# end
# return VE
# end

# function vossmatrix(sequence::SeqOrView{AminoAcidAlphabet})
# seqlen = length(sequence)
# VE = BitMatrix(undef, 20, length(sequence))
# @inbounds for i in i:seqlen
# aminoacid = sequence[i]
# # aminoacid = sequence[i] in AA20 ? sequence[i] : error("Unknown amino acid in position $(findall(sequence[i], sequence)).")
# # aminoacid ∉ AA20 error("Unknown amino acid in position $(findall(aminoacid, sequence)).")
# @assert aminoacid != AA_Term "Stop codons in position $(findall(AA_Term, sequence)). Stop codons are not supported."
# @assert aminoacid != AA_Gap "Gap in position $(findall(AA_Gap, sequence)). Gaps are not supported."
# @inbounds for j in 1:20
# VE[j, i] = (aminoacid == AA20[j])
# end
# end
# return VE
# end

# function vossvectormatrix(sequence::SeqOrView{A}) where {A <: Alphabet}
# seqtype = eltype(sequence)
# if seqtype == DNA || seqtype == RNA
# VE = BitMatrix(undef, 4, length(sequence))
# @inbounds for i in eachindex(ACGT)
# VE[i, :] = vossvector(sequence, ACGT[i])
# end
# end
# return VE
# end
end

0 comments on commit 9e60cd7

Please sign in to comment.