Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add tryparse #241

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/BioSequences.jl
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ import Twiddle: enumerate_nibbles,
repeatpattern
using Random

# Either an `AbstractVector` or an `AbstractString`.
const SeqLike = Union{AbstractVector, AbstractString}
# `String` or `SubString{String}`: the string types the `LongSequence`
# `parse`/`tryparse` methods take directly. The alias itself does not check
# that the content is ASCII.
const ASCIILike = Union{String, SubString{String}}

include("alphabet.jl")

# Load the bit-twiddling internals that optimised BioSequences methods depend on.
Expand Down
23 changes: 11 additions & 12 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@ iscomplete(A::Alphabet) = Val(length(symbols(A)) === 1 << bits_per_symbol(A))
Encode BioSymbol `S` to an internal representation using an `Alphabet`.
This encoding is checked to enforce a valid data element.
"""
function encode end
function encode(A::Alphabet, x)
    # `tryencode` reports an unrepresentable symbol as `nothing`; convert that
    # into an `EncodeError`, otherwise hand back the encoded value.
    encoded = tryencode(A, x)
    encoded === nothing && throw(EncodeError(A, x))
    return something(encoded)
end

struct EncodeError{A<:Alphabet,T} <: Exception
val::T
Expand Down Expand Up @@ -167,11 +169,11 @@ for A in (DNAAlphabet, RNAAlphabet)
@eval begin

# 2-bit encoding
@inline function encode(::$(A){2}, nt::$(T))
@inline function tryencode(::$(A){2}, nt::$(T))
if count_ones(nt) != 1 || !isvalid(nt)
throw(EncodeError($(A){2}(), nt))
return nothing
end
return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01])
convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01])
end

@inline function decode(::$(A){2}, x::UInt)
Expand All @@ -181,11 +183,8 @@ for A in (DNAAlphabet, RNAAlphabet)
# Unsigned integers other than `UInt` are converted to `UInt` and forwarded to
# the `UInt` method.
@inline function decode(::$(A){2}, x::Unsigned)
    return decode($(A){2}(), UInt(x))
end

# 4-bit encoding
@inline function encode(::$(A){4}, nt::$(T))
if !isvalid(nt)
throw(EncodeError($(A){4}(), nt))
end
return convert(UInt, reinterpret(UInt8, nt))
@inline function tryencode(::$(A){4}, nt::$(T))
isvalid(nt) ? convert(UInt, reinterpret(UInt8, nt)) : nothing
end

@inline function decode(::$(A){4}, x::UInt)
Expand Down Expand Up @@ -220,11 +219,11 @@ function symbols(::AminoAcidAlphabet)
AA_Y, AA_V, AA_O, AA_U, AA_B, AA_J, AA_Z, AA_X, AA_Term, AA_Gap)
end

@inline function encode(::AminoAcidAlphabet, aa::AminoAcid)
@inline function tryencode(::AminoAcidAlphabet, aa::AminoAcid)
if reinterpret(UInt8, aa) > reinterpret(UInt8, AA_Gap)
throw(EncodeError(AminoAcidAlphabet(), aa))
return nothing
end
return convert(UInt, reinterpret(UInt8, aa))
convert(UInt, reinterpret(UInt8, aa))
end

@inline function decode(::AminoAcidAlphabet, x::UInt)
Expand Down
48 changes: 33 additions & 15 deletions src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,20 +60,8 @@
end

# Constructors from strings
# Build a sequence from a `String`/`SubString{String}`; the method that does the
# work is selected by the alphabet's code type.
function LongSequence{A}(s::Union{String, SubString{String}}) where {A<:Alphabet}
    code = codetype(A())
    return LongSequence{A}(s, code)
end

# Fallback for any `AlphabetCode`: size the sequence by the string's character
# count and fill it with `copyto!`.
function LongSequence{A}(s::Union{String, SubString{String}}, ::AlphabetCode) where {A<:Alphabet}
    nchars = length(s)
    return copyto!(LongSequence{A}(undef, nchars), 1, s, 1, nchars)
end

# `AsciiAlphabet` path: the sequence gets one element per code unit and is
# filled chunk-wise straight from the string's bytes.
function LongSequence{A}(s::Union{String, SubString{String}}, ::AsciiAlphabet) where {A<:Alphabet}
    nbytes = ncodeunits(s)
    bytes = codeunits(s)
    return encode_chunks!(LongSequence{A}(undef, nbytes), 1, bytes, 1, nbytes)
end
# Any string input is routed through `Base.parse` for the same sequence type.
LongSequence{A}(s::AbstractString) where {A <: Alphabet} = parse(LongSequence{A}, s)

function LongSequence{A}(
Expand All @@ -85,4 +73,34 @@
return copyto!(seq, 1, src, first(part), len)
end

# Fallback for string types other than `String`/`SubString{String}`: convert the
# input to `String` so the call reaches the `ASCIILike` method.
#
# This must not call `LongSequence{A}(seq)`: that constructor forwards every
# `AbstractString` straight back to `parse`, so the two methods would call each
# other until the stack overflows.
# NOTE(review): assumes every `LongSequence{A}` has `A <: Alphabet`, so the
# `String` call always finds the `ASCIILike` method — confirm against the struct.
function Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A
    return parse(LongSequence{A}, String(seq))
end
# Generic fallback: convert an arbitrary `AbstractString` to `String` and parse
# again, so that a more specific `String` method can take over.
#
# Throws a `MethodError` when the input is already a `String` or
# `SubString{String}`.
function Base.parse(::Type{T}, s::AbstractString) where {T <: LongSequence}
    # If a `String`/`SubString{String}` ends up in this method, no more specific
    # method exists for `T` (for example the bare `LongSequence` without an
    # alphabet). Converting to `String` again would dispatch right back here
    # and never terminate, so report the missing method instead.
    s isa ASCIILike && throw(MethodError(parse, (T, s)))
    return parse(T, String(s))
end

# Parse `s` into a `LongSequence{A}`. `_tryparse` hands back either the parsed
# sequence or an `Int`; the `Int` marks a failure and is passed on to
# `throw_encode_error`.
function Base.parse(T::Type{LongSequence{A}}, s::ASCIILike) where {A<:Alphabet}
    code = codetype(A())
    outcome = _tryparse(T, s, code)
    outcome isa Int || return outcome
    # For `AsciiAlphabet` the error is reported against the raw code units,
    # otherwise against the string itself.
    source = code isa AsciiAlphabet ? codeunits(s) : s
    return throw_encode_error(A(), source, outcome)
end

# Generic fallback: convert an arbitrary `AbstractString` to `String` and try
# again, so that a more specific `String` method can take over.
#
# Throws a `MethodError` when the input is already a `String` or
# `SubString{String}`.
function Base.tryparse(::Type{T}, s::AbstractString) where {T <: LongSequence}
    # If a `String`/`SubString{String}` ends up in this method, no more specific
    # method exists for `T` (for example the bare `LongSequence` without an
    # alphabet). Converting to `String` again would dispatch right back here
    # and never terminate. That is a missing method rather than a parse
    # failure, so it is raised instead of returning `nothing`.
    s isa ASCIILike && throw(MethodError(tryparse, (T, s)))
    return tryparse(T, String(s))
end

Check warning on line 89 in src/longsequences/constructors.jl

View check run for this annotation

Codecov / codecov/patch

src/longsequences/constructors.jl#L89

Added line #L89 was not covered by tests

# Try to parse `s` into a `LongSequence{A}`.
#
# Returns the parsed sequence, or `nothing` when `_tryparse` reports a failure
# (which it does by returning an `Int` instead of a sequence).
#
# NOTE(review): Codecov flagged this method (src/longsequences/constructors.jl
# lines 91-93) as not covered by tests.
# NOTE(review): the `_tryparse` method for non-`AsciiAlphabet` code types still
# goes through `copyto!` and carries a TODO, so on that path invalid input
# presumably throws instead of giving `nothing` — confirm.
function Base.tryparse(::Type{LongSequence{A}}, s::ASCIILike) where {A <: Alphabet}
    outcome = _tryparse(LongSequence{A}, s, codetype(A()))
    return outcome isa Int ? nothing : outcome
end

# `_tryparse` for a generic `AlphabetCode`: allocate a sequence with one element
# per character of `s` and fill it with `copyto!`.
function _tryparse(::Type{LongSequence{A}}, s::ASCIILike, ::AlphabetCode) where {A<:Alphabet}
    nchars = length(s)
    parsed = LongSequence{A}(undef, nchars)
    # TODO: this path has no failure value of its own; the result is whatever
    # `copyto!` returns. Presumably invalid input makes `copyto!` throw — confirm.
    return copyto!(parsed, 1, s, 1, nchars)
end

# `_tryparse` for `AsciiAlphabet`: allocate one element per code unit of `s` and
# encode the raw bytes chunk-wise. The result of `try_encode_chunks!` (the
# filled sequence, or an `Int` on failure) is returned unchanged.
function _tryparse(::Type{LongSequence{A}}, s::ASCIILike, ::AsciiAlphabet) where {A<:Alphabet}
    nbytes = ncodeunits(s)
    parsed = LongSequence{A}(undef, nbytes)
    return try_encode_chunks!(parsed, 1, codeunits(s), 1, nbytes)
end
46 changes: 37 additions & 9 deletions src/longsequences/copying.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,6 @@ function _copyto!(dst::SeqOrView{A}, doff::Integer,
end

#########
# Either an `AbstractVector` or an `AbstractString`.
const SeqLike = Union{AbstractVector, AbstractString}
# `String` or `SubString{String}`. The alias itself does not check that the
# content is ASCII.
const ASCIILike = Union{String, SubString{String}}

"""
copy!(dst::LongSequence, src)
Expand Down Expand Up @@ -167,34 +165,64 @@ end
@assert false "Expected error in encoding"
end

@inline function encode_chunk(A::Alphabet, src::AbstractArray{UInt8}, soff::Integer, N::Integer)
@inline function encode_chunk(
A::Alphabet,
src::AbstractArray{UInt8},
soff::Integer,
N::Integer
)::Union{UInt64, Int}
chunk = zero(UInt64)
check = 0x00
@inbounds for i in 1:N
enc = ascii_encode(A, src[soff+i-1])
check |= enc
chunk |= UInt64(enc) << (bits_per_symbol(A) * (i-1))
end
check & 0x80 == 0x00 || throw_encode_error(A, src, soff)
return chunk
check & 0x80 == 0x00 || return Int(soff)::Int
return chunk::UInt64
end

# Use this for AsciiAlphabet alphabets only; internal use only, no bounds checks.
# This is preferential to `copyto!` if none of the sequence's original content
# needs to be kept, since this is faster.
function encode_chunks!(dst::SeqOrView{A}, startindex::Integer, src::AbstractVector{UInt8},
soff::Integer, N::Integer) where {A <: Alphabet}
function try_encode_chunks!(
dst::SeqOrView{A},
startindex::Integer,
src::AbstractVector{UInt8},
soff::Integer,
N::Integer
)::Union{Int, SeqOrView} where {A <: Alphabet}
chunks, rest = divrem(N, symbols_per_data_element(dst))
@inbounds for i in startindex:startindex+chunks-1
dst.data[i] = encode_chunk(A(), src, soff, symbols_per_data_element(dst))
chunk = encode_chunk(A(), src, soff, symbols_per_data_element(dst))
if chunk isa Int
return chunk
else
dst.data[i] = chunk
end
soff += symbols_per_data_element(dst)
end
@inbounds if !iszero(rest)
dst.data[startindex+chunks] = encode_chunk(A(), src, soff, rest)
chunk = encode_chunk(A(), src, soff, rest)
if chunk isa Int
return chunk
else
dst.data[startindex+chunks] = chunk
end
end
return dst
end

# Throwing counterpart of `try_encode_chunks!`: same arguments, but an `Int`
# result (the failure signal) is turned into an error via `throw_encode_error`
# instead of being returned. On success `dst` is returned.
function encode_chunks!(dst::SeqOrView{A},
    startindex::Integer,
    src::AbstractVector{UInt8},
    soff::Integer,
    N::Integer
)::SeqOrView where {A <: Alphabet}
    outcome = try_encode_chunks!(dst, startindex, src, soff, N)
    if outcome isa Int
        return throw_encode_error(A(), src, outcome)
    end
    return outcome
end

#########

# Two-argument method
Expand Down
Loading