diff --git a/src/geneticcode.jl b/src/geneticcode.jl index de3da73a..e98df9ef 100644 --- a/src/geneticcode.jl +++ b/src/geneticcode.jl @@ -329,18 +329,18 @@ result in an error. For organisms that utilize alternative start codons, one can set `alternative_start=true`, in which case the first codon will always be converted to a methionine. """ -function translate(ntseq::LongNuc{N}; +function translate(ntseq::SeqOrView; code::GeneticCode = standard_genetic_code, allow_ambiguous_codons::Bool = true, alternative_start::Bool = false -) where N +) len = div((length(ntseq) % UInt) * 11, 32) translate!(LongAA(undef, len), ntseq; code = code, allow_ambiguous_codons = allow_ambiguous_codons, alternative_start = alternative_start) end function translate!(aaseq::LongAA, - ntseq::LongNuc{2};#Sequence{<:Union{DNAAlphabet{2}, RNAAlphabet{2}}}; + ntseq::SeqOrView{<:NucleicAcidAlphabet{2}}; code::GeneticCode = standard_genetic_code, allow_ambiguous_codons::Bool = true, alternative_start::Bool = false @@ -360,7 +360,7 @@ function translate!(aaseq::LongAA, end function translate!(aaseq::LongAA, - ntseq::LongNuc{4};#Sequence{<:Union{DNAAlphabet{4}, RNAAlphabet{4}}}; + ntseq::SeqOrView{<:NucleicAcidAlphabet{4}}; code::GeneticCode = standard_genetic_code, allow_ambiguous_codons::Bool = true, alternative_start::Bool = false @@ -401,7 +401,7 @@ function try_translate_ambiguous_codon( allow_ambiguous || error("codon ", a, b, c, " cannot be unambiguously translated") aa = if aa_new in (AA_N, AA_D) && aa in (AA_N, AA_D, AA_B) AA_B - elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_B) + elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_J) AA_J elseif aa_new in (AA_Q, AA_E) && aa in (AA_Q, AA_E, AA_Z) AA_Z diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl index 63ab5b55..6f17dc31 100644 --- a/src/longsequences/seqview.jl +++ b/src/longsequences/seqview.jl @@ -93,6 +93,12 @@ function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where T(seq.data, 1:length(seq)) end +function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}, part::AbstractUnitRange{<:Integer}) where + {N, T<:LongSubSeq{<:NucleicAcidAlphabet{N}}} + @boundscheck checkbounds(seq, part) + T(seq.data, UnitRange{Int}(part)) +end + function Base.convert(::Type{T1}, seq::T2) where {T1 <: Union{LongSequence, LongSubSeq}, T2 <: Union{LongSequence, LongSubSeq}} return T1(seq) diff --git a/test/longsequences/seqview.jl b/test/longsequences/seqview.jl index 5d66ab74..777a8c35 100644 --- a/test/longsequences/seqview.jl +++ b/test/longsequences/seqview.jl @@ -24,6 +24,9 @@ @test vv == vv2 == vv3 @test LongSubSeq{AminoAcidAlphabet}(seq) == view(seq, eachindex(seq)) + + s = dna"TASCTAWTA" + @test collect(LongSubSeq{RNAAlphabet{4}}(s, 3:7)) == collect(LongRNA{4}(s)[3:7]) end @testset "Basics" begin diff --git a/test/translation.jl b/test/translation.jl index f4d744b4..ce4f19ba 100644 --- a/test/translation.jl +++ b/test/translation.jl @@ -1,27 +1,57 @@ @testset "Translation" begin # crummy string translation to test against - standard_genetic_code_dict = Dict{String,Char}( - "AAA" => 'K', "AAC" => 'N', "AAG" => 'K', "AAU" => 'N', - "ACA" => 'T', "ACC" => 'T', "ACG" => 'T', "ACU" => 'T', - "AGA" => 'R', "AGC" => 'S', "AGG" => 'R', "AGU" => 'S', - "AUA" => 'I', "AUC" => 'I', "AUG" => 'M', "AUU" => 'I', - "CAA" => 'Q', "CAC" => 'H', "CAG" => 'Q', "CAU" => 'H', - "CCA" => 'P', "CCC" => 'P', "CCG" => 'P', "CCU" => 'P', - "CGA" => 'R', "CGC" => 'R', "CGG" => 'R', "CGU" => 'R', - "CUA" => 'L', "CUC" => 'L', "CUG" => 'L', "CUU" => 'L', - "GAA" => 'E', "GAC" => 'D', "GAG" => 'E', "GAU" => 'D', - "GCA" => 'A', "GCC" => 'A', "GCG" => 'A', "GCU" => 'A', - "GGA" => 'G', "GGC" => 'G', "GGG" => 'G', "GGU" => 'G', - "GUA" => 'V', "GUC" => 'V', "GUG" => 'V', "GUU" => 'V', - "UAA" => '*', "UAC" => 'Y', "UAG" => '*', "UAU" => 'Y', - "UCA" => 'S', "UCC" => 'S', "UCG" => 'S', "UCU" => 'S', - "UGA" => '*', "UGC" => 'C', "UGG" => 'W', "UGU" => 'C', - "UUA" => 'L', "UUC" => 'F', "UUG" => 'L', "UUU" => 'F', + standard_genetic_code_dict = let + d_ = Dict{String,Char}( + "AAA" => 'K', "AAC" => 'N', "AAG" => 'K', "AAU" => 'N', + "ACA" => 'T', "ACC" => 'T', "ACG" => 'T', "ACU" => 'T', + "AGA" => 'R', "AGC" => 'S', "AGG" => 'R', "AGU" => 'S', + "AUA" => 'I', "AUC" => 'I', "AUG" => 'M', "AUU" => 'I', + "CAA" => 'Q', "CAC" => 'H', "CAG" => 'Q', "CAU" => 'H', + "CCA" => 'P', "CCC" => 'P', "CCG" => 'P', "CCU" => 'P', + "CGA" => 'R', "CGC" => 'R', "CGG" => 'R', "CGU" => 'R', + "CUA" => 'L', "CUC" => 'L', "CUG" => 'L', "CUU" => 'L', + "GAA" => 'E', "GAC" => 'D', "GAG" => 'E', "GAU" => 'D', + "GCA" => 'A', "GCC" => 'A', "GCG" => 'A', "GCU" => 'A', + "GGA" => 'G', "GGC" => 'G', "GGG" => 'G', "GGU" => 'G', + "GUA" => 'V', "GUC" => 'V', "GUG" => 'V', "GUU" => 'V', + "UAA" => '*', "UAC" => 'Y', "UAG" => '*', "UAU" => 'Y', + "UCA" => 'S', "UCC" => 'S', "UCG" => 'S', "UCU" => 'S', + "UGA" => '*', "UGC" => 'C', "UGG" => 'W', "UGU" => 'C', + "UUA" => 'L', "UUC" => 'F', "UUG" => 'L', "UUU" => 'F', + ) + d = Dict{NTuple{3, RNA}, Char}(map(RNA, Tuple(k)) => v for (k, v) in d_) - # translatable ambiguities in the standard code - "CUN" => 'L', "CCN" => 'P', "CGN" => 'R', "ACN" => 'T', - "GUN" => 'V', "GCN" => 'A', "GGN" => 'G', "UCN" => 'S' - ) + for (a, b, c) in Iterators.product(ntuple(i -> alphabet(RNA), Val{3}())...) + (a == RNA_Gap || b == RNA_Gap || c == RNA_Gap) && continue + (a, b, c) ∈ keys(d) && continue + possible = ' ' + for (x, y, z) in Iterators.product(map(BioSequences.UnambiguousRNAs, (a, b, c))...) + new_possible = d[(x, y, z)] + if possible == ' ' + possible = new_possible + elseif possible == new_possible + nothing + elseif possible ∈ ('J', 'I', 'L') && new_possible ∈ ('I', 'L') + possible = 'J' + elseif possible ∈ ('B', 'D', 'N') && new_possible ∈ ('D', 'N') + possible = 'B' + elseif possible ∈ ('Z', 'Q', 'E') && new_possible ∈ ('Q', 'E') + possible = 'Z' + else + possible = 'X' + break + end + end + d[(a, b, c)] = possible + end + result = Dict(join(k) => v for (k, v) in d) + for (k, v) in result + if 'U' in k + result[replace(k, 'U'=>'T')] = v + end + end + result + end function string_translate(seq::AbstractString) @assert length(seq) % 3 == 0 @@ -32,18 +62,6 @@ return String(aaseq) end - function check_translate(seq::AbstractString) - return string_translate(seq) == String(translate(LongRNA{4}(seq))) - end - - aas = aa"" - global reps = 10 - for len in [1, 10, 32, 1000, 10000, 100000] - @test all(Bool[check_translate(random_translatable_rna(len)) for _ in 1:reps]) - seq = LongDNA{4}(LongRNA{4}(random_translatable_rna(len))) - @test translate!(aas, seq) == translate(seq) - end - # Basics @test length(BioSequences.standard_genetic_code) == 64 buf = IOBuffer() @@ -57,6 +75,26 @@ show(buf, BioSequences.ncbi_trans_table) @test !iszero(length(take!(buf))) + # Test translation values + sampler = SamplerWeighted(rna"ACGUMRSVWYHKDBN", [fill(0.2, 4); fill(0.018181818, 10)]) + for len in [3, 9, 54, 102] + for A in [DNAAlphabet{4}(), RNAAlphabet{4}()] + for attempt in 1:25 + seq = randseq(A, sampler, len) + @test string(translate(seq)) == string_translate(string(seq)) + for window in [1:3, 5:13, 11:67] + checkbounds(Bool, eachindex(seq), window) || continue + v = view(seq, window) + @test string(translate(v)) == string_translate(string(v)) + end + end + end + for A in [DNAAlphabet{2}(), RNAAlphabet{2}()] + seq = randseq(A, len) + @test string(translate(seq)) == string_translate(string(seq)) + end + end + # ambiguous codons @test translate(rna"YUGMGG") == aa"LR" @test translate(rna"GAYGARGAM") == aa"DEX" @@ -84,4 +122,9 @@ # issue #133 @test translate(rna"GAN") == aa"X" + + # Test views + seq = dna"TAGTGCNTAGDACGGGWAAABCCGTAC" + @test translate(view(seq, 1:0)) == aa"" + @test translate(LongSubSeq{RNAAlphabet{4}}(seq, 2:length(seq)-2)) == translate(seq[2:end-2]) end