From 700b564136dacba83562c1c9b67ec77bb6a6ca6a Mon Sep 17 00:00:00 2001
From: Jakob Nybo Nissen
Date: Sun, 23 Jul 2023 12:55:14 +0200
Subject: [PATCH 1/2] WIP: Kmers.jl compatibility

---
Review notes (text between `---` and the first `diff --git` is not part of the
commit message):
- showerror: Char(val) is now only taken for integers in 0 <= val < 0x80;
  Char of a negative integer throws, which would hide the EncodeError.
- reversebits(::T, ::BitsPerSymbol{32}): the mask literal was one hex digit
  short (31 digits). With T == UInt64 the `% T` truncation yields the same
  value, so this corrects the literal without changing results.
- NOTE(review): BitUnsigned includes UInt128, but no reversebits method is
  added for UInt128 with BitsPerSymbol{16}, {32} or {64} - confirm intended.
- The blob ids on the `index` lines predate the two in-line fixes above.

 src/alphabet.jl                          | 10 +++++++-
 src/biosequence/biosequence.jl           |  5 ++--
 src/bit-manipulation/bit-manipulation.jl | 29 ++++++++++++++++++++----
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/alphabet.jl b/src/alphabet.jl
index 790a822f..2fa8a5d8 100644
--- a/src/alphabet.jl
+++ b/src/alphabet.jl
@@ -111,7 +111,15 @@ end
 EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)
 
 function Base.showerror(io::IO, err::EncodeError{A}) where {A}
-    print(io, "cannot encode ", repr(err.val), " in ", A)
+    val = err.val
+    char_repr = if val isa Integer && 0 <= val < 0x80
+        repr(val) * " (Char '" * Char(val) * "')"
+    elseif val isa Union{AbstractString, AbstractChar}
+        repr(val)
+    else
+        string(val)
+    end
+    print(io, "cannot encode " * char_repr * " in ", A)
 end
 
 """
diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl
index 93fd4963..3cec4951 100644
--- a/src/biosequence/biosequence.jl
+++ b/src/biosequence/biosequence.jl
@@ -60,7 +60,7 @@ function has_interface(
         isempty(syms) && error("Vector syms must not be empty")
         first(syms) isa eltype(T) || error("Vector is of wrong element type")
         seq = T((i for i in syms))
-        length(seq) > 0 || return false
+        length(seq) == length(syms) || return false
         eachindex(seq) === Base.OneTo(length(seq)) || return false
         E = encoded_data_eltype(T)
         e = extract_encoded_element(seq, 1)
@@ -87,13 +87,14 @@ Base.nextind(::BioSequence, i::Integer) = Int(i) + 1
 Base.prevind(::BioSequence, i::Integer) = Int(i) - 1
 Base.size(x::BioSequence) = (length(x),)
 Base.eltype(::Type{<:BioSequence{A}}) where {A <: Alphabet} = eltype(A)
-Base.eltype(x::BioSequence) = eltype(typeof(x))
 Alphabet(::Type{<:BioSequence{A}}) where {A <: Alphabet} = A()
 Alphabet(x::BioSequence) = Alphabet(typeof(x))
 Base.isempty(x::BioSequence) = iszero(length(x))
 Base.empty(::Type{T}) where {T <: BioSequence} = T(eltype(T)[])
 Base.empty(x::BioSequence) = empty(typeof(x))
 BitsPerSymbol(x::BioSequence) = BitsPerSymbol(Alphabet(typeof(x)))
+bits_per_symbol(::Type{T}) where {T <: BioSequence} = bits_per_symbol(Alphabet(T))
+bits_per_symbol(x::BioSequence) = bits_per_symbol(typeof(x))
 Base.hash(s::BioSequence, x::UInt) = foldl((a, b) -> hash(b, a), s, init=x)
 
 function Base.similar(seq::BioSequence, len::Integer=length(seq))
diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl
index e40c6e1e..253bdfb0 100644
--- a/src/bit-manipulation/bit-manipulation.jl
+++ b/src/bit-manipulation/bit-manipulation.jl
@@ -1,16 +1,37 @@
-@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned
+
+include("bitindex.jl")
+
+const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128}
+
+@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned
      mask = 0x33333333333333333333333333333333 % T
      x = ((x >> 2) & mask) | ((x & mask) << 2)
      return reversebits(x, BitsPerSymbol{4}())
 end
 
-@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: Base.BitUnsigned
+@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: BitUnsigned
      mask = 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F % T
      x = ((x >> 4) & mask) | ((x & mask) << 4)
-    return bswap(x)
+    return reversebits(x, BitsPerSymbol{8}())
+end
+
+@inline reversebits(x::T, ::BitsPerSymbol{8}) where T <: BitUnsigned = bswap(x)
+
+@inline reversebits(x::UInt16, ::BitsPerSymbol{16}) = x
+@inline function reversebits(x::T, ::BitsPerSymbol{16}) where T <: Union{UInt32, UInt64}
+    mask = 0x0000FFFF0000FFFF0000FFFF0000FFFF % T
+    x = ((x >> 16) & mask) | ((x & mask) << 16)
+    reversebits(x, BitsPerSymbol{32}())
+end
+
+@inline reversebits(x::UInt32, ::BitsPerSymbol{32}) = x
+@inline function reversebits(x::T, ::BitsPerSymbol{32}) where T <: Union{UInt64}
+    mask = 0x00000000FFFFFFFF00000000FFFFFFFF % T
+    x = ((x >> 32) & mask) | ((x & mask) << 32)
+    reversebits(x, BitsPerSymbol{64}())
 end
 
-reversebits(x::T, ::BitsPerSymbol{8}) where T <: Base.BitUnsigned = bswap(x)
+@inline reversebits(x::UInt64, ::BitsPerSymbol{64}) = x
 
 @inline function complement_bitpar(x::Unsigned, ::T) where {T<:NucleicAcidAlphabet{2}}
     return ~x

From c334880123ad6e6d977a35f2521cb35196a3fe61 Mon Sep 17 00:00:00 2001
From: Jakob Nybo Nissen
Date: Sun, 21 Jan 2024 17:10:28 +0100
Subject: [PATCH 2/2] WIP: firstbitindex

---
 src/biosequence/indexing.jl              | 3 +++
 src/bit-manipulation/bit-manipulation.jl | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/biosequence/indexing.jl b/src/biosequence/indexing.jl
index 4a87059c..c2634965 100644
--- a/src/biosequence/indexing.jl
+++ b/src/biosequence/indexing.jl
@@ -11,6 +11,9 @@
     (i % UInt) - 1 < (lastindex(seq) % UInt) ? (@inbounds seq[i], i + 1) : nothing
 end
 
+lastbitindex(x::BioSequence) = bitindex(x, lastindex(x))
+firstbitindex(x::BioSequence) = bitindex(x, firstindex(x))
+
 ## Bounds checking
 function Base.checkbounds(x::BioSequence, i::Integer)
     firstindex(x) ≤ i ≤ lastindex(x) || throw(BoundsError(x, i))
diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl
index 253bdfb0..f1d2f8ff 100644
--- a/src/bit-manipulation/bit-manipulation.jl
+++ b/src/bit-manipulation/bit-manipulation.jl
@@ -1,6 +1,4 @@
 
-include("bitindex.jl")
-
 const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128}
 
 @inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned