Skip to content

Commit

Permalink
Allow translation of LongSubSeq
Browse files Browse the repository at this point in the history
This also fixes an issue in the translation code, and adds another constructor
to LongSubSeq that I thought was missing.
  • Loading branch information
jakobnissen committed Sep 26, 2023
1 parent b4f5970 commit f64918e
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 38 deletions.
10 changes: 5 additions & 5 deletions src/geneticcode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -329,18 +329,18 @@ result in an error. For organisms that utilize alternative start codons, one
can set `alternative_start=true`, in which case the first codon will always be
converted to a methionine.
"""
function translate(ntseq::LongNuc{N};
function translate(ntseq::SeqOrView;
code::GeneticCode = standard_genetic_code,
allow_ambiguous_codons::Bool = true,
alternative_start::Bool = false
) where N
)
len = div((length(ntseq) % UInt) * 11, 32)
translate!(LongAA(undef, len), ntseq; code = code,
allow_ambiguous_codons = allow_ambiguous_codons, alternative_start = alternative_start)
end

function translate!(aaseq::LongAA,
ntseq::LongNuc{2};#Sequence{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}};
ntseq::SeqOrView{<:NucleicAcidAlphabet{2}};
code::GeneticCode = standard_genetic_code,
allow_ambiguous_codons::Bool = true,
alternative_start::Bool = false
Expand All @@ -360,7 +360,7 @@ function translate!(aaseq::LongAA,
end

function translate!(aaseq::LongAA,
ntseq::LongNuc{4};#Sequence{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}};
ntseq::SeqOrView{<:NucleicAcidAlphabet{4}};
code::GeneticCode = standard_genetic_code,
allow_ambiguous_codons::Bool = true,
alternative_start::Bool = false
Expand Down Expand Up @@ -401,7 +401,7 @@ function try_translate_ambiguous_codon(
allow_ambiguous || error("codon ", a, b, c, " cannot be unambiguously translated")
aa = if aa_new in (AA_N, AA_D) && aa in (AA_N, AA_D, AA_B)
AA_B
elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_B)
elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_J)
AA_J
elseif aa_new in (AA_Q, AA_E) && aa in (AA_Q, AA_E, AA_Z)
AA_Z
Expand Down
6 changes: 6 additions & 0 deletions src/longsequences/seqview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where
T(seq.data, 1:length(seq))
end

function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}, part::AbstractUnitRange{<:Integer}) where
{N, T<:LongSubSeq{<:NucleicAcidAlphabet{N}}}
@boundscheck checkbounds(seq, part)
T(seq.data, UnitRange{Int}(part))
end

function Base.convert(::Type{T1}, seq::T2) where
{T1 <: Union{LongSequence, LongSubSeq}, T2 <: Union{LongSequence, LongSubSeq}}
return T1(seq)
Expand Down
3 changes: 3 additions & 0 deletions test/longsequences/seqview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
@test vv == vv2 == vv3

@test LongSubSeq{AminoAcidAlphabet}(seq) == view(seq, eachindex(seq))

s = dna"TASCTAWTA"
@test collect(LongSubSeq{RNAAlphabet{4}}(s, 3:7)) == collect(LongRNA{4}(s)[3:7])
end

@testset "Basics" begin
Expand Down
109 changes: 76 additions & 33 deletions test/translation.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,57 @@
@testset "Translation" begin
# crummy string translation to test against
standard_genetic_code_dict = Dict{String,Char}(
"AAA" => 'K', "AAC" => 'N', "AAG" => 'K', "AAU" => 'N',
"ACA" => 'T', "ACC" => 'T', "ACG" => 'T', "ACU" => 'T',
"AGA" => 'R', "AGC" => 'S', "AGG" => 'R', "AGU" => 'S',
"AUA" => 'I', "AUC" => 'I', "AUG" => 'M', "AUU" => 'I',
"CAA" => 'Q', "CAC" => 'H', "CAG" => 'Q', "CAU" => 'H',
"CCA" => 'P', "CCC" => 'P', "CCG" => 'P', "CCU" => 'P',
"CGA" => 'R', "CGC" => 'R', "CGG" => 'R', "CGU" => 'R',
"CUA" => 'L', "CUC" => 'L', "CUG" => 'L', "CUU" => 'L',
"GAA" => 'E', "GAC" => 'D', "GAG" => 'E', "GAU" => 'D',
"GCA" => 'A', "GCC" => 'A', "GCG" => 'A', "GCU" => 'A',
"GGA" => 'G', "GGC" => 'G', "GGG" => 'G', "GGU" => 'G',
"GUA" => 'V', "GUC" => 'V', "GUG" => 'V', "GUU" => 'V',
"UAA" => '*', "UAC" => 'Y', "UAG" => '*', "UAU" => 'Y',
"UCA" => 'S', "UCC" => 'S', "UCG" => 'S', "UCU" => 'S',
"UGA" => '*', "UGC" => 'C', "UGG" => 'W', "UGU" => 'C',
"UUA" => 'L', "UUC" => 'F', "UUG" => 'L', "UUU" => 'F',
standard_genetic_code_dict = let
d_ = Dict{String,Char}(
"AAA" => 'K', "AAC" => 'N', "AAG" => 'K', "AAU" => 'N',
"ACA" => 'T', "ACC" => 'T', "ACG" => 'T', "ACU" => 'T',
"AGA" => 'R', "AGC" => 'S', "AGG" => 'R', "AGU" => 'S',
"AUA" => 'I', "AUC" => 'I', "AUG" => 'M', "AUU" => 'I',
"CAA" => 'Q', "CAC" => 'H', "CAG" => 'Q', "CAU" => 'H',
"CCA" => 'P', "CCC" => 'P', "CCG" => 'P', "CCU" => 'P',
"CGA" => 'R', "CGC" => 'R', "CGG" => 'R', "CGU" => 'R',
"CUA" => 'L', "CUC" => 'L', "CUG" => 'L', "CUU" => 'L',
"GAA" => 'E', "GAC" => 'D', "GAG" => 'E', "GAU" => 'D',
"GCA" => 'A', "GCC" => 'A', "GCG" => 'A', "GCU" => 'A',
"GGA" => 'G', "GGC" => 'G', "GGG" => 'G', "GGU" => 'G',
"GUA" => 'V', "GUC" => 'V', "GUG" => 'V', "GUU" => 'V',
"UAA" => '*', "UAC" => 'Y', "UAG" => '*', "UAU" => 'Y',
"UCA" => 'S', "UCC" => 'S', "UCG" => 'S', "UCU" => 'S',
"UGA" => '*', "UGC" => 'C', "UGG" => 'W', "UGU" => 'C',
"UUA" => 'L', "UUC" => 'F', "UUG" => 'L', "UUU" => 'F',
)
d = Dict{NTuple{3, RNA}, Char}(map(RNA, Tuple(k)) => v for (k, v) in d_)

# translatable ambiguities in the standard code
"CUN" => 'L', "CCN" => 'P', "CGN" => 'R', "ACN" => 'T',
"GUN" => 'V', "GCN" => 'A', "GGN" => 'G', "UCN" => 'S'
)
for (a, b, c) in Iterators.product(ntuple(i -> alphabet(RNA), Val{3}())...)
(a == RNA_Gap || b == RNA_Gap || c == RNA_Gap) && continue
(a, b, c) keys(d) && continue
possible = ' '
for (x, y, z) in Iterators.product(map(BioSequences.UnambiguousRNAs, (a, b, c))...)
new_possible = d[(x, y, z)]
if possible == ' '
possible = new_possible
elseif possible == new_possible
nothing
elseif possible ('J', 'I', 'L') && new_possible ('I', 'L')
possible = 'J'
elseif possible ('B', 'D', 'N') && new_possible ('D', 'N')
possible = 'B'
elseif possible ('Z', 'Q', 'E') && new_possible ('Q', 'E')
possible = 'Z'
else
possible = 'X'
break
end
end
d[(a, b, c)] = possible
end
result = Dict(join(k) => v for (k, v) in d)
for (k, v) in result
if 'U' in k
result[replace(k, 'U'=>'T')] = v
end
end
result
end

function string_translate(seq::AbstractString)
@assert length(seq) % 3 == 0
Expand All @@ -32,18 +62,6 @@
return String(aaseq)
end

function check_translate(seq::AbstractString)
return string_translate(seq) == String(translate(LongRNA{4}(seq)))
end

aas = aa""
global reps = 10
for len in [1, 10, 32, 1000, 10000, 100000]
@test all(Bool[check_translate(random_translatable_rna(len)) for _ in 1:reps])
seq = LongDNA{4}(LongRNA{4}(random_translatable_rna(len)))
@test translate!(aas, seq) == translate(seq)
end

# Basics
@test length(BioSequences.standard_genetic_code) == 64
buf = IOBuffer()
Expand All @@ -57,6 +75,26 @@
show(buf, BioSequences.ncbi_trans_table)
@test !iszero(length(take!(buf)))

# Test translation values
sampler = SamplerWeighted(rna"ACGUMRSVWYHKDBN", [fill(0.2, 4); fill(0.018181818, 10)])
for len in [3, 9, 54, 102]
for A in [DNAAlphabet{4}(), RNAAlphabet{4}()]
for attempt in 1:25
seq = randseq(A, sampler, len)
@test string(translate(seq)) == string_translate(string(seq))
for window in [1:3, 5:13, 11:67]
checkbounds(Bool, eachindex(seq), window) || continue
v = view(seq, window)
@test string(translate(v)) == string_translate(string(v))
end
end
end
for A in [DNAAlphabet{2}(), RNAAlphabet{2}()]
seq = randseq(A, len)
@test string(translate(seq)) == string_translate(string(seq))
end
end

# ambiguous codons
@test translate(rna"YUGMGG") == aa"LR"
@test translate(rna"GAYGARGAM") == aa"DEX"
Expand Down Expand Up @@ -84,4 +122,9 @@

# issue #133
@test translate(rna"GAN") == aa"X"

# Test views
seq = dna"TAGTGCNTAGDACGGGWAAABCCGTAC"
@test translate(view(seq, 1:0)) == aa""
@test translate(LongSubSeq{RNAAlphabet{4}}(seq, 2:length(seq)-2)) == translate(seq[2:end-2])
end

0 comments on commit f64918e

Please sign in to comment.