Allow translation of LongSubSeq

This also fixes an issue in the translation code, and adds another constructor to LongSubSeq that I thought was missing.
BioJulia · Sep 26, 2023 · f64918e · f64918e
1 parent b4f5970
commit f64918e
Show file tree

Hide file tree

Showing 4 changed files with 90 additions and 38 deletions.
diff --git a/src/geneticcode.jl b/src/geneticcode.jl
@@ -329,18 +329,18 @@ result in an error. For organisms that utilize alternative start codons, one
 can set `alternative_start=true`, in which case the first codon will always be
 converted to a methionine.
 """
-function translate(ntseq::LongNuc{N};
+function translate(ntseq::SeqOrView;
     code::GeneticCode = standard_genetic_code,
     allow_ambiguous_codons::Bool = true,
     alternative_start::Bool = false
-) where N
+)
     len = div((length(ntseq) % UInt) * 11, 32)
     translate!(LongAA(undef, len), ntseq; code = code,
     allow_ambiguous_codons = allow_ambiguous_codons, alternative_start = alternative_start)
 end
 
 function translate!(aaseq::LongAA,
-    ntseq::LongNuc{2};#Sequence{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}};
+    ntseq::SeqOrView{<:NucleicAcidAlphabet{2}};
     code::GeneticCode = standard_genetic_code,
     allow_ambiguous_codons::Bool = true,
     alternative_start::Bool = false
@@ -360,7 +360,7 @@ function translate!(aaseq::LongAA,
 end
 
 function translate!(aaseq::LongAA,
-    ntseq::LongNuc{4};#Sequence{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}};
+    ntseq::SeqOrView{<:NucleicAcidAlphabet{4}};
     code::GeneticCode = standard_genetic_code,
     allow_ambiguous_codons::Bool = true,
     alternative_start::Bool = false
@@ -401,7 +401,7 @@ function try_translate_ambiguous_codon(
         allow_ambiguous || error("codon ", a, b, c, " cannot be unambiguously translated")
         aa = if aa_new in (AA_N, AA_D) && aa in (AA_N, AA_D, AA_B)
             AA_B
-        elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_B)
+        elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_J)
             AA_J
         elseif aa_new in (AA_Q, AA_E) && aa in (AA_Q, AA_E, AA_Z)
             AA_Z

diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl
@@ -93,6 +93,12 @@ function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where
 	T(seq.data, 1:length(seq))
 end
 
+function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}, part::AbstractUnitRange{<:Integer}) where
+	{N, T<:LongSubSeq{<:NucleicAcidAlphabet{N}}}
+	@boundscheck checkbounds(seq, part)
+	T(seq.data, UnitRange{Int}(part))
+end
+
 function Base.convert(::Type{T1}, seq::T2) where
 	{T1 <: Union{LongSequence, LongSubSeq}, T2 <: Union{LongSequence, LongSubSeq}}
 	return T1(seq)

diff --git a/test/longsequences/seqview.jl b/test/longsequences/seqview.jl
@@ -24,6 +24,9 @@
 	@test vv == vv2 == vv3
 
 	@test LongSubSeq{AminoAcidAlphabet}(seq) == view(seq, eachindex(seq))
+
+	s = dna"TASCTAWTA"
+	@test collect(LongSubSeq{RNAAlphabet{4}}(s, 3:7)) == collect(LongRNA{4}(s)[3:7])
 end
 
 @testset "Basics" begin

diff --git a/test/translation.jl b/test/translation.jl
@@ -1,27 +1,57 @@
 @testset "Translation" begin
     # crummy string translation to test against
-    standard_genetic_code_dict = Dict{String,Char}(
-        "AAA" => 'K', "AAC" => 'N', "AAG" => 'K', "AAU" => 'N',
-        "ACA" => 'T', "ACC" => 'T', "ACG" => 'T', "ACU" => 'T',
-        "AGA" => 'R', "AGC" => 'S', "AGG" => 'R', "AGU" => 'S',
-        "AUA" => 'I', "AUC" => 'I', "AUG" => 'M', "AUU" => 'I',
-        "CAA" => 'Q', "CAC" => 'H', "CAG" => 'Q', "CAU" => 'H',
-        "CCA" => 'P', "CCC" => 'P', "CCG" => 'P', "CCU" => 'P',
-        "CGA" => 'R', "CGC" => 'R', "CGG" => 'R', "CGU" => 'R',
-        "CUA" => 'L', "CUC" => 'L', "CUG" => 'L', "CUU" => 'L',
-        "GAA" => 'E', "GAC" => 'D', "GAG" => 'E', "GAU" => 'D',
-        "GCA" => 'A', "GCC" => 'A', "GCG" => 'A', "GCU" => 'A',
-        "GGA" => 'G', "GGC" => 'G', "GGG" => 'G', "GGU" => 'G',
-        "GUA" => 'V', "GUC" => 'V', "GUG" => 'V', "GUU" => 'V',
-        "UAA" => '*', "UAC" => 'Y', "UAG" => '*', "UAU" => 'Y',
-        "UCA" => 'S', "UCC" => 'S', "UCG" => 'S', "UCU" => 'S',
-        "UGA" => '*', "UGC" => 'C', "UGG" => 'W', "UGU" => 'C',
-        "UUA" => 'L', "UUC" => 'F', "UUG" => 'L', "UUU" => 'F',
+    standard_genetic_code_dict = let
+        d_ = Dict{String,Char}(
+            "AAA" => 'K', "AAC" => 'N', "AAG" => 'K', "AAU" => 'N',
+            "ACA" => 'T', "ACC" => 'T', "ACG" => 'T', "ACU" => 'T',
+            "AGA" => 'R', "AGC" => 'S', "AGG" => 'R', "AGU" => 'S',
+            "AUA" => 'I', "AUC" => 'I', "AUG" => 'M', "AUU" => 'I',
+            "CAA" => 'Q', "CAC" => 'H', "CAG" => 'Q', "CAU" => 'H',
+            "CCA" => 'P', "CCC" => 'P', "CCG" => 'P', "CCU" => 'P',
+            "CGA" => 'R', "CGC" => 'R', "CGG" => 'R', "CGU" => 'R',
+            "CUA" => 'L', "CUC" => 'L', "CUG" => 'L', "CUU" => 'L',
+            "GAA" => 'E', "GAC" => 'D', "GAG" => 'E', "GAU" => 'D',
+            "GCA" => 'A', "GCC" => 'A', "GCG" => 'A', "GCU" => 'A',
+            "GGA" => 'G', "GGC" => 'G', "GGG" => 'G', "GGU" => 'G',
+            "GUA" => 'V', "GUC" => 'V', "GUG" => 'V', "GUU" => 'V',
+            "UAA" => '*', "UAC" => 'Y', "UAG" => '*', "UAU" => 'Y',
+            "UCA" => 'S', "UCC" => 'S', "UCG" => 'S', "UCU" => 'S',
+            "UGA" => '*', "UGC" => 'C', "UGG" => 'W', "UGU" => 'C',
+            "UUA" => 'L', "UUC" => 'F', "UUG" => 'L', "UUU" => 'F',
+        )
+        d = Dict{NTuple{3, RNA}, Char}(map(RNA, Tuple(k)) => v for (k, v) in d_)
 
-        # translatable ambiguities in the standard code
-        "CUN" => 'L', "CCN" => 'P', "CGN" => 'R', "ACN" => 'T',
-        "GUN" => 'V', "GCN" => 'A', "GGN" => 'G', "UCN" => 'S'
-    )
+        for (a, b, c) in Iterators.product(ntuple(i -> alphabet(RNA), Val{3}())...)
+            (a == RNA_Gap || b == RNA_Gap || c == RNA_Gap) && continue
+            (a, b, c) ∈ keys(d) && continue
+            possible = ' '
+            for (x, y, z) in Iterators.product(map(BioSequences.UnambiguousRNAs, (a, b, c))...)
+                new_possible = d[(x, y, z)]
+                if possible == ' '
+                    possible = new_possible
+                elseif possible == new_possible
+                    nothing
+                elseif possible ∈ ('J', 'I', 'L') && new_possible ∈ ('I', 'L')
+                    possible = 'J'
+                elseif possible ∈ ('B', 'D', 'N') && new_possible ∈ ('D', 'N')
+                    possible = 'B'
+                elseif possible ∈ ('Z', 'Q', 'E') && new_possible ∈ ('Q', 'E')
+                    possible = 'Z'
+                else
+                    possible = 'X'
+                    break
+                end
+            end
+            d[(a, b, c)] = possible
+        end
+        result = Dict(join(k) => v for (k, v) in d)
+        for (k, v) in result
+            if 'U' in k
+                result[replace(k, 'U'=>'T')] = v
+            end
+        end
+        result
+    end
 
     function string_translate(seq::AbstractString)
         @assert length(seq) % 3 == 0
@@ -32,18 +62,6 @@
         return String(aaseq)
     end
 
-    function check_translate(seq::AbstractString)
-        return string_translate(seq) == String(translate(LongRNA{4}(seq)))
-    end
-
-    aas = aa""
-    global reps = 10
-    for len in [1, 10, 32, 1000, 10000, 100000]
-        @test all(Bool[check_translate(random_translatable_rna(len)) for _ in 1:reps])
-        seq = LongDNA{4}(LongRNA{4}(random_translatable_rna(len)))
-        @test translate!(aas, seq) == translate(seq)
-    end
-
     # Basics
     @test length(BioSequences.standard_genetic_code) == 64
     buf = IOBuffer()
@@ -57,6 +75,26 @@
     show(buf, BioSequences.ncbi_trans_table)
     @test !iszero(length(take!(buf)))
 
+    # Test translation values
+    sampler = SamplerWeighted(rna"ACGUMRSVWYHKDBN", [fill(0.2, 4); fill(0.018181818, 10)])
+    for len in [3, 9, 54, 102]
+        for A in [DNAAlphabet{4}(), RNAAlphabet{4}()]
+            for attempt in 1:25
+                seq = randseq(A, sampler, len)
+                @test string(translate(seq)) == string_translate(string(seq))
+                for window in [1:3, 5:13, 11:67]
+                    checkbounds(Bool, eachindex(seq), window) || continue
+                    v = view(seq, window)
+                    @test string(translate(v)) == string_translate(string(v))
+                end
+            end
+        end
+        for A in [DNAAlphabet{2}(), RNAAlphabet{2}()]
+            seq = randseq(A, len)
+            @test string(translate(seq)) == string_translate(string(seq))
+        end
+    end
+
     # ambiguous codons
     @test translate(rna"YUGMGG") == aa"LR"
     @test translate(rna"GAYGARGAM") == aa"DEX"
@@ -84,4 +122,9 @@
 
     # issue #133
     @test translate(rna"GAN") == aa"X"
+
+    # Test views
+    seq = dna"TAGTGCNTAGDACGGGWAAABCCGTAC"
+    @test translate(view(seq, 1:0)) == aa""
+    @test translate(LongSubSeq{RNAAlphabet{4}}(seq, 2:length(seq)-2)) == translate(seq[2:end-2])
 end