diff --git a/src/BioSequences.jl b/src/BioSequences.jl index 111d33b1..c17599b7 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -202,6 +202,9 @@ import Twiddle: enumerate_nibbles, repeatpattern using Random +const SeqLike = Union{AbstractVector, AbstractString} +const ASCIILike = Union{String, SubString{String}} + include("alphabet.jl") # Load the bit-twiddling internals that optimised BioSequences methods depend on. diff --git a/src/alphabet.jl b/src/alphabet.jl index 06b7ffa6..06279a18 100644 --- a/src/alphabet.jl +++ b/src/alphabet.jl @@ -85,7 +85,9 @@ iscomplete(A::Alphabet) = Val(length(symbols(A)) === 1 << bits_per_symbol(A)) Encode BioSymbol `S` to an internal representation using an `Alphabet`. This decoding is checked to enforce valid data element. """ -function encode end +function encode(A::Alphabet, x) + @something tryencode(A, x) throw(EncodeError(A, x)) +end struct EncodeError{A<:Alphabet,T} <: Exception val::T @@ -167,11 +169,11 @@ for A in (DNAAlphabet, RNAAlphabet) @eval begin # 2-bit encoding - @inline function encode(::$(A){2}, nt::$(T)) + @inline function tryencode(::$(A){2}, nt::$(T)) if count_ones(nt) != 1 || !isvalid(nt) - throw(EncodeError($(A){2}(), nt)) + return nothing end - return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01]) + convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01]) end @inline function decode(::$(A){2}, x::UInt) @@ -181,11 +183,8 @@ for A in (DNAAlphabet, RNAAlphabet) @inline decode(::$(A){2}, x::Unsigned) = decode($(A){2}(), UInt(x)) # 4-bit encoding - @inline function encode(::$(A){4}, nt::$(T)) - if !isvalid(nt) - throw(EncodeError($(A){4}(), nt)) - end - return convert(UInt, reinterpret(UInt8, nt)) + @inline function tryencode(::$(A){4}, nt::$(T)) + isvalid(nt) ? convert(UInt, reinterpret(UInt8, nt)) : nothing end @inline function decode(::$(A){4}, x::UInt) @@ -220,11 +219,11 @@ function symbols(::AminoAcidAlphabet) AA_Y, AA_V, AA_O, AA_U, AA_B, AA_J, AA_Z, AA_X, AA_Term, AA_Gap) end -@inline function encode(::AminoAcidAlphabet, aa::AminoAcid) +@inline function tryencode(::AminoAcidAlphabet, aa::AminoAcid) if reinterpret(UInt8, aa) > reinterpret(UInt8, AA_Gap) - throw(EncodeError(AminoAcidAlphabet(), aa)) + return nothing end - return convert(UInt, reinterpret(UInt8, aa)) + convert(UInt, reinterpret(UInt8, aa)) end @inline function decode(::AminoAcidAlphabet, x::UInt) diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index abdd2f94..80d01858 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -60,20 +60,8 @@ function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where end # Constructors from strings -function LongSequence{A}(s::Union{String, SubString{String}}) where {A<:Alphabet} - return LongSequence{A}(s, codetype(A())) -end - -# Generic method for String/Substring. -function LongSequence{A}(s::Union{String, SubString{String}}, ::AlphabetCode) where {A<:Alphabet} - len = length(s) - seq = LongSequence{A}(undef, len) - return copyto!(seq, 1, s, 1, len) -end - -function LongSequence{A}(s::Union{String, SubString{String}}, ::AsciiAlphabet) where {A<:Alphabet} - seq = LongSequence{A}(undef, ncodeunits(s)) - return encode_chunks!(seq, 1, codeunits(s), 1, ncodeunits(s)) +function LongSequence{A}(s::AbstractString) where {A <: Alphabet} + return parse(LongSequence{A}, s) end function LongSequence{A}( @@ -85,4 +73,34 @@ function LongSequence{A}( return copyto!(seq, 1, src, first(part), len) end -Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq) \ No newline at end of file +Base.parse(::Type{T}, s::AbstractString) where {T <: LongSequence} = parse(T, String(s)) + +function Base.parse(T::Type{LongSequence{A}}, s::ASCIILike) where {A<:Alphabet} + C = codetype(A()) + src = C isa AsciiAlphabet ? codeunits(s) : s + n = _tryparse(T, s, C) + if n isa Int + throw_encode_error(A(), src, n) + else + n + end +end + +Base.tryparse(::Type{T}, s::AbstractString) where {T <: LongSequence} = tryparse(T, String(s)) + +function Base.tryparse(::Type{LongSequence{A}}, s::ASCIILike) where {A <: Alphabet} + n = _tryparse(LongSequence{A}, s, codetype(A())) + n isa Int ? nothing : n +end + +function _tryparse(::Type{LongSequence{A}}, s::ASCIILike, ::AlphabetCode) where {A<:Alphabet} + len = length(s) + seq = LongSequence{A}(undef, len) + # TODO! + return copyto!(seq, 1, s, 1, len) +end + +function _tryparse(::Type{LongSequence{A}}, s::ASCIILike, ::AsciiAlphabet) where {A<:Alphabet} + seq = LongSequence{A}(undef, ncodeunits(s)) + try_encode_chunks!(seq, 1, codeunits(s), 1, ncodeunits(s)) +end \ No newline at end of file diff --git a/src/longsequences/copying.jl b/src/longsequences/copying.jl index b98e0360..d18a73e6 100644 --- a/src/longsequences/copying.jl +++ b/src/longsequences/copying.jl @@ -107,8 +107,6 @@ function _copyto!(dst::SeqOrView{A}, doff::Integer, end ######### -const SeqLike = Union{AbstractVector, AbstractString} -const ASCIILike = Union{String, SubString{String}} """ copy!(dst::LongSequence, src) @@ -167,7 +165,12 @@ end @assert false "Expected error in encoding" end -@inline function encode_chunk(A::Alphabet, src::AbstractArray{UInt8}, soff::Integer, N::Integer) +@inline function encode_chunk( + A::Alphabet, + src::AbstractArray{UInt8}, + soff::Integer, + N::Integer +)::Union{UInt64, Int} chunk = zero(UInt64) check = 0x00 @inbounds for i in 1:N @@ -175,26 +178,51 @@ end check |= enc chunk |= UInt64(enc) << (bits_per_symbol(A) * (i-1)) end - check & 0x80 == 0x00 || throw_encode_error(A, src, soff) - return chunk + check & 0x80 == 0x00 || return Int(soff)::Int + return chunk::UInt64 end # Use this for AsiiAlphabet alphabets only, internal use only, no boundschecks. # This is preferential to `copyto!` if none of the sequence's original content # needs to be kept, since this is faster. -function encode_chunks!(dst::SeqOrView{A}, startindex::Integer, src::AbstractVector{UInt8}, - soff::Integer, N::Integer) where {A <: Alphabet} +function try_encode_chunks!( + dst::SeqOrView{A}, + startindex::Integer, + src::AbstractVector{UInt8}, + soff::Integer, + N::Integer +)::Union{Int, SeqOrView} where {A <: Alphabet} chunks, rest = divrem(N, symbols_per_data_element(dst)) @inbounds for i in startindex:startindex+chunks-1 - dst.data[i] = encode_chunk(A(), src, soff, symbols_per_data_element(dst)) + chunk = encode_chunk(A(), src, soff, symbols_per_data_element(dst)) + if chunk isa Int + return chunk + else + dst.data[i] = chunk + end soff += symbols_per_data_element(dst) end @inbounds if !iszero(rest) - dst.data[startindex+chunks] = encode_chunk(A(), src, soff, rest) + chunk = encode_chunk(A(), src, soff, rest) + if chunk isa Int + return chunk + else + dst.data[startindex+chunks] = chunk + end end return dst end +function encode_chunks!(dst::SeqOrView{A}, + startindex::Integer, + src::AbstractVector{UInt8}, + soff::Integer, + N::Integer +)::SeqOrView where {A <: Alphabet} + s = try_encode_chunks!(dst, startindex, src, soff, N) + s isa Int ? throw_encode_error(A(), src, s) : s +end + ######### # Two-argument method