import string
import collections
import pylab
__all__ = ['Sequence']
[docs]class Sequence(object):
"""Common data structure to all sequences (e.g., :meth:`~biokit.sequence.dna.DNA`)
A sequence is a string contained in the :attr:`_data`. If you manipulate this
attribute, you should also changed the :attr:`_N` (length of the string) and
set :attr:`_counter` to None.
Sequences can be concatenated easily. You can also add a string or numpy
array or pandas time series to an existing sequence::
d1 = Sequence('ACGT')
d2 = Sequence('ACGT')
Note that there is a :meth:`check` method, which is not called during the
instanciation but is called when adding sequences together. Each type of
sequence (e.g., Sequence, DNA, RNA) has its own symbols. So you cannot add
a DNA sequence with a RNA sequence for instance. Those are valid operation::
>>> d1 = Sequence('ACGT')
>>> d1 += 'AAAA'
>>> d1 + d1
>>> "AAAA" + d1
"""
def __init__(self, data=''):
# initialise before filling _data attribute
self._checked = False
if isinstance(data, str):
self._data = data
elif isinstance(data, Sequence):
self._data = data._data
self._checked = data._checked
elif self._looks_like_a_sequence(data) is True:
self._data = data._data
self._checked = data._checked
else:
# assume it is a list or numpy array or pandas TimeSeries
self._data = "".join(data)
self._N = len(self._data)
self._counter = None
try:
#python2
self.symbols = string.punctuation + string.letters
except:
# python3
self.symbols = string.punctuation + string.ascii_letters
self._type = 'Sequence'
def _looks_like_a_sequence(self, this):
# if it looks like a sequence, let us assume it is a sequence
if hasattr(this, 'symbols') and hasattr(this, '_data') and\
hasattr(this, '_checked'):
return True
else:
return False
def _get_N(self):
return self._N
N = property(_get_N)
def __len__(self):
return self._N
def _get_sequence(self):
return self._data[:]
sequence = property(_get_sequence,
doc="returns a copy of the sequence")
def _get_count(self):
if self._counter is None:
self._counter = collections.Counter(self._data)
return self._counter
counter = property(_get_count, doc="return counter of the letters")
[docs] def histogram(self):
pylab.clf()
import pandas as pd
pd.Series(self.counter).plot(kind='bar')
[docs] def pie(self):
pylab.clf()
keys = self.counter.keys()
labels = dict([(k,float(self.counter[k])/len(self)) for k in keys])
pylab.pie([self.counter[k] for k in keys], labels=[k +
':'+str(labels[k]) for k in keys])
[docs] def hamming_distance(self, other):
"""Return hamming distance between this sequence and another sequence
The Hamming distance between s and t, denoted dH(s,t), is the number of
corresponding symbols that differ in s and t.
::
>>> d1 = 'GAGCCTACTAACGGGAT'
>>> d2 = 'CATCGTAATGACGGCCT'
>>> s = Sequence(d1)
>>> s.hamming_distance(d2)
7
"""
# TODO:: convert to appropriate sequence.
return sum(1 for x,y in zip(self._data, other._data) if x!=y)
[docs] def upper(self):
"""convertes sequence string to uppercase (inplace)"""
self._data = self._data.upper()
[docs] def lower(self):
"""convertes sequence string to lowercase (inplace)"""
self._data = self._data.lower()
def _check_sequence(self):
"""checks that characters are valid symbols"""
for i, x in enumerate(self._data):
if x not in self.symbols:
raise ValueError("found invalid symbol %s at position %s" % (x,i))
self._checked = True
def __repr__(self):
if self._N > 10:
return "%s: %s ... (length %s) " % (self._type, self.sequence[0:10],
self._N)
else:
return "%s: %s (length %s) " % (self._type, self.sequence, self._N)
def __str__(self):
if self._N > 10:
return "%s: %s ... (length %s) " % (self._type,self.sequence[0:10],
self._N)
else:
return "%s: %s (length %s) " % (self._type, self.sequence, self._N)
def __convert_to_compat(self, other):
from biokit.sequence.rna import RNA
from biokit.sequence.dna import DNA
if isinstance(self, RNA):
other = RNA(other)
elif isinstance(self, DNA):
other = DNA(other)
elif isinstance(self, Sequence):
other = Sequence(other)
# if self._check is True:
# other._check_sequence()
return other
def __add__(self, other):
# input may be a string or list, in which case we need to convert to a sequence
if isinstance(other, Sequence) is False:
other = self.__convert_to_compat(other)
elif type(other) != type(self):
raise TypeError('incompatible sequences %s versus %s' % (type(other), type(self)))
# now let us add the 2 sequences
return self.__convert_to_compat(self._data + other._data)
def __radd__(self, other):
"""operator other + self"""
if isinstance(other, Sequence) is False:
other = self.__convert_to_compat(other)
elif type(other) != type(self):
raise TypeError('incompatible sequences %s versus %s' % (type(other), type(self)))
# now let us add the 2 sequences
return self.__convert_to_compat(other._data + self._data)
def __iadd__(self, other):
if isinstance(other, Sequence) is False:
other = self.__convert_to_compat(other)
elif type(other) != type(self):
raise TypeError('incompatible sequences %s versus %s' % (type(other), type(self)))
# now let us add the 2 sequences
self._data += other._data
self._N = self._N + other._N
return self
def __eq__(self, other):
if isinstance(other, str):
return self._data == other
else:
#assume this is a sequence:
return self._data == other._data