-
Notifications
You must be signed in to change notification settings - Fork 48
Open
Labels
feature requestA request for some desired or missing functionalityA request for some desired or missing functionality
Description
Sometimes it makes sense to begin indexing a sequence from its center. For example with hyper-variable loop regions for T cell / B cell receptors or peptides presented by antigen presenting cells. These are short amino acid sequences (lengths<20) whose start and end sequences are structured and center sequences are highly variable. Therefore I propose a CenteredSequence{A}
type
Expected Behavior
- Centered indexing mimicking behavior of OffsetArrays.jl
- Return
AA_Gap
instead of out of bounds error, making it easier to align sequences of variable lengths
x = CenteredSequence{AminoAcidAlphabet}("CASSLASGSTGELFF")
x[0] # AA_G
x[-3:3] # LASGSTG
x[0:9] # GSTGELFF--
Possible Implementation
struct CenteredSequence{A <: Alphabet} <: BioSequence{A}
data::Vector{UInt64}
len::UInt
function CenteredSequence{A}(data::Vector{UInt64}, len::UInt) where {A <: Alphabet}
new{A}(data, len)
end
end
function CenteredSequence{A}(it) where {A <: Alphabet}
x = LongSequence{A}(it)
CenteredSequence{A}(x.data, x.len % UInt)
end
function CenteredSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet}
if len < 0
throw(ArgumentError("len must be non-negative"))
end
return CenteredSequence{A}('-'^len)
end
Base.length(x::CenteredSequence) = x.len % Int
BioSequences.encoded_data_eltype(::Type{<:CenteredSequence}) = UInt64
Base.copy(x::CenteredSequence) = typeof(x)(copy(x.data), x.len)
Base.eachindex(x::CenteredSequence) = range(firstindex(x),lastindex(x))
Base.firstindex(x::CenteredSequence) = 1 - trunc(Int,ceil(length(x)/2))
Base.lastindex(x::CenteredSequence) = trunc(Int,length(x)/2)
@inline function BioSequences.extract_encoded_element(x::CenteredSequence, i::Integer)
bi = BioSequences.bitindex(x, i % UInt)
BioSequences.extract_encoded_element(bi, x.data)
end
@inline function BioSequences.bitindex(x::CenteredSequence, i::Integer)
N = BioSequences.BitsPerSymbol(Alphabet(typeof(x)))
BioSequences.bitindex(N, BioSequences.encoded_data_eltype(typeof(x)), i)
end
Base.@propagate_inbounds function Base.getindex(x::CenteredSequence, i::Integer)
firstindex(x) ≤ i ≤ lastindex(x) || return gap(eltype(x))
data = BioSequences.extract_encoded_element(x, i+1-firstindex(x) )
return BioSequences.decode(Alphabet(x), data)
end
Base.@propagate_inbounds function Base.getindex(x::CenteredSequence, idx::AbstractVector{<:Integer})
isempty(idx) && return empty(x)
v = map(i->x[i],idx)
return typeof(x)(v)
end
Base.@propagate_inbounds function Base.getindex(x::CenteredSequence, bools::AbstractVector{Bool})
@boundscheck checkbounds(x, bools)
v = map(i->x[i], eachindex(x)[bools])
return typeof(x)(v)
end
@assert BioSequences.has_interface(BioSequence, CenteredSequence{AminoAcidAlphabet}, [alphabet(AminoAcid)...], false)
However there seem to be issues with this implementation
Edit: fixed in BioSequence v3.0.2
x = rand(["WQERWER","PIOUIOP","CASSAKL"],12)
length(unique(x)) # 3 as expected
x = map(LongSequence{AminoAcidAlphabet},x)
length(unique(x)) # 3 as expected
x = map(CenteredSequence{AminoAcidAlphabet},x)
length(unique(x)) # unexpected behvaiour
Metadata
Metadata
Assignees
Labels
feature requestA request for some desired or missing functionalityA request for some desired or missing functionality