# Copyright 2012 by Wibowo Arindrarto.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Bio.SearchIO object to model a single database hit."""

from __future__ import print_function

from itertools import chain

from Bio._py3k import filter

from Bio._utils import getattr_str, trim_str
from Bio.SearchIO._utils import allitems, optionalcascade

from ._base import _BaseSearchObject
from .hsp import HSP


class Hit(_BaseSearchObject):

    """Class representing a single database hit of a search result.

    Hit objects are the second-level container in the SearchIO module. They
    are the objects contained within a QueryResult (see QueryResult). They
    themselves are containers for HSP objects and will contain at least one
    HSP.

    To have a quick look at a Hit and its contents, invoke `print` on it:

    >>> from Bio import SearchIO
    >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
    >>> hit = qresult[3]
    >>> print(hit)
    Query: 33211
     mir_1
     Hit: gi|301171322|ref|NR_035857.1| (86)
     Pan troglodytes microRNA mir-520c (MIR520C), microRNA
     HSPs: ---- -------- --------- ------ --------------- ---------------------
              #  E-value Bit score   Span     Query range             Hit range
           ---- -------- --------- ------ --------------- ---------------------
              0  8.9e-20    100.47     60          [1:61]               [13:73]
              1  3.3e-06     55.39     60          [0:60]               [13:73]

    You can invoke `len` on a Hit object to see how many HSP objects it
    contains:

    >>> len(hit)
    2

    Hit objects behave very similarly to Python lists. You can retrieve the
    HSP object inside a Hit using the HSP's integer index. Hit objects can
    also be sliced, which will return a new Hit object containing only the
    sliced HSPs:

    # HSP items inside the Hit can be retrieved using its integer index
    >>> hit[0]
    HSP(hit_id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 fragments)

    # slicing returns a new Hit
    >>> hit
    Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 2 hsps)
    >>> hit[:1]
    Hit(id='gi|301171322|ref|NR_035857.1|', query_id='33211', 1 hsps)
    >>> print(hit[1:])
    Query: 33211
     mir_1
     Hit: gi|301171322|ref|NR_035857.1| (86)
     Pan troglodytes microRNA mir-520c (MIR520C), microRNA
     HSPs: ---- -------- --------- ------ --------------- ---------------------
              #  E-value Bit score   Span     Query range             Hit range
           ---- -------- --------- ------ --------------- ---------------------
              0  3.3e-06     55.39     60          [0:60]               [13:73]

    Hit objects provide `filter` and `map` methods, which are analogous to
    Python's built-in `filter` and `map` except that they return a new Hit
    object instead of a list.

    Here is an example of using `filter` to select for HSPs whose e-value is
    less than 1e-10:

    >>> evalue_filter = lambda hsp: hsp.evalue < 1e-10
    >>> filtered_hit = hit.filter(evalue_filter)
    >>> len(hit)
    2
    >>> len(filtered_hit)
    1
    >>> print(filtered_hit)
    Query: 33211
     mir_1
     Hit: gi|301171322|ref|NR_035857.1| (86)
     Pan troglodytes microRNA mir-520c (MIR520C), microRNA
     HSPs: ---- -------- --------- ------ --------------- ---------------------
              #  E-value Bit score   Span     Query range             Hit range
           ---- -------- --------- ------ --------------- ---------------------
              0  8.9e-20    100.47     60          [1:61]               [13:73]

    There are also other methods which are counterparts of Python lists'
    methods with the same names: `append`, `index`, `pop`, and `sort`. Consult
    their respective documentations for more details and examples of their
    usage.

    """

    # attributes we don't want to transfer when creating a new Hit class
    # from this one
    _NON_STICKY_ATTRS = ('_items', )

    def __init__(self, hsps=(), id=None, query_id=None):
        """Initializes a Hit object.

        Arguments:
        hsps -- Iterable of HSP objects (defaults to no HSPs).
        id -- String of the Hit ID
        query_id -- String of the Hit's query ID

        If multiple HSP objects are used for initialization, they must all
        have the same `query_id`, `query_description`, `hit_id`, and
        `hit_description` properties; a ValueError is raised otherwise.

        """
        # the input is traversed several times below, so materialize it once;
        # this also lets one-shot iterables (e.g. generators) be used
        hsps = list(hsps)

        # default attribute values
        self._id = id
        self._query_id = query_id
        self._description = None
        self._query_description = None

        for attr in ('query_id', 'query_description', 'hit_id',
                     'hit_description'):
            # HACK: setting the if clause to '> 1' allows for empty hit
            # objects. This makes it easier to work with file formats with
            # unpredictable hit-hsp ordering. The empty hit object itself is
            # nonfunctional, however, since all its cascading properties are
            # empty.
            if len(set(getattr(hsp, attr) for hsp in hsps)) > 1:
                raise ValueError("Hit object can not contain HSPs with "
                        "more than one %s." % attr)

        self._items = []
        for hsp in hsps:
            # append() validates each HSP before storing it
            self.append(hsp)

    def __repr__(self):
        """Returns a short summary: hit ID, query ID and number of HSPs."""
        return "Hit(id=%r, query_id=%r, %r hsps)" % (self.id, self.query_id,
                len(self))

    def __iter__(self):
        """Iterates over the contained HSP objects."""
        return iter(self.hsps)

    def __len__(self):
        """Returns the number of contained HSP objects."""
        return len(self.hsps)

    #Python 3:
    def __bool__(self):
        """Returns True if the Hit contains at least one HSP."""
        return bool(self.hsps)

    #Python 2:
    __nonzero__ = __bool__

    def __contains__(self, hsp):
        """Checks whether the given HSP object is stored in the Hit."""
        return hsp in self._items

    def __str__(self):
        """Returns a human-readable, tabular summary of the Hit."""
        lines = []

        # set query id line
        qid_line = 'Query: %s' % self.query_id
        if self.query_description:
            qid_line += trim_str('\n %s' %
                    self.query_description, 80, '...')
        lines.append(qid_line)

        # set hit id line
        hid_line = ' Hit: %s' % self.id
        if hasattr(self, 'seq_len'):
            hid_line += ' (%i)' % self.seq_len
        if self.description:
            hid_line += trim_str('\n %s' % self.description,
                    80, '...')
        lines.append(hid_line)

        # set hsp line and table
        if not self.hsps:
            lines.append(' HSPs: ?')
        else:
            lines.append(' HSPs: %s %s %s %s %s %s' %
                    ('-'*4, '-'*8, '-'*9, '-'*6, '-'*15, '-'*21))
            pattern = '%11s %8s %9s %6s %15s %21s'
            lines.append(pattern % ('#', 'E-value', 'Bit score', 'Span',
                    'Query range', 'Hit range'))
            lines.append(pattern % ('-'*4, '-'*8, '-'*9, '-'*6, '-'*15,
                    '-'*21))
            for idx, hsp in enumerate(self.hsps):
                # evalue
                evalue = getattr_str(hsp, 'evalue', fmt='%.2g')
                # bitscore
                bitscore = getattr_str(hsp, 'bitscore', fmt='%.2f')
                # alignment length
                aln_span = getattr_str(hsp, 'aln_span')
                # query region
                query_start = getattr_str(hsp, 'query_start')
                query_end = getattr_str(hsp, 'query_end')
                query_range = '[%s:%s]' % (query_start, query_end)
                # max column length is 15
                query_range = trim_str(query_range, 15, '~]')
                # hit region
                hit_start = getattr_str(hsp, 'hit_start')
                hit_end = getattr_str(hsp, 'hit_end')
                hit_range = '[%s:%s]' % (hit_start, hit_end)
                # max column length is 21
                hit_range = trim_str(hit_range, 21, '~]')
                # append the hsp row
                lines.append(pattern % (str(idx), evalue, bitscore, aln_span,
                        query_range, hit_range))

        return '\n'.join(lines)

    def __getitem__(self, idx):
        """Returns the HSP at an integer index, or a new Hit for a slice."""
        # if key is slice, return a new Hit instance
        if isinstance(idx, slice):
            obj = self.__class__(self.hsps[idx])
            self._transfer_attrs(obj)
            return obj
        return self._items[idx]

    def __setitem__(self, idx, hsps):
        """Replaces the HSP(s) at the given index or slice.

        Arguments:
        idx -- Integer index or slice object.
        hsps -- A single HSP object when `idx` is an integer, or an iterable
                of HSP objects when `idx` is a slice.

        Every new HSP is validated before anything is stored. A TypeError is
        raised if a non-HSP object would end up in the Hit.

        """
        # Choose the validation path from the index kind, not the value kind:
        # an integer index must receive exactly one HSP, while a slice
        # receives a sequence of them. Dispatching on the value type would
        # let `hit[0] = [hsp]` store a list as an item and let
        # `hit[0:1] = hsp` splice the HSP's fragments into the container.
        if isinstance(idx, slice):
            hsps = list(hsps)
            for hsp in hsps:
                self._validate_hsp(hsp)
        else:
            self._validate_hsp(hsps)

        self._items[idx] = hsps

    def __delitem__(self, idx):
        """Removes the HSP(s) at the given index or slice."""
        del self._items[idx]

    ## hsp properties ##
    def _validate_hsp(self, hsp):
        """Validates an HSP object.

        Valid HSP objects have the same hit_id as the Hit object ID and the
        same query_id as the Hit object's query_id. The hit and query
        descriptions are checked in the same way.

        Raises TypeError if `hsp` is not an HSP object, and ValueError if one
        of its identifying attributes differs from the Hit's.

        """
        if not isinstance(hsp, HSP):
            raise TypeError("Hit objects can only contain HSP objects.")
        # HACK: to make validation during __init__ work
        if not self._items:
            return

        # (Hit attribute, HSP attribute, label used in the error message)
        checks = (
            ('id', 'hit_id', 'hit ID'),
            ('description', 'hit_description', 'hit description'),
            ('query_id', 'query_id', 'query ID'),
            ('query_description', 'query_description', 'query description'),
        )
        for own_attr, hsp_attr, label in checks:
            expected = getattr(self, own_attr)
            found = getattr(hsp, hsp_attr)
            if expected is None:
                # nothing to compare against yet: adopt the HSP's value
                setattr(self, own_attr, found)
            elif found != expected:
                raise ValueError("Expected HSP with %s %r, "
                        "found %r instead." % (label, expected, found))

    ## properties ##
    description = optionalcascade('_description', 'hit_description',
            """Hit description""")
    query_description = optionalcascade('_query_description',
            'query_description',
            """Description of the query that produced the hit""")
    id = optionalcascade('_id', 'hit_id', """Hit ID string.""")
    query_id = optionalcascade('_query_id', 'query_id',
            """ID string of the query that produced the hit""")
    # returns all hsps
    hsps = allitems(doc="""HSP objects contained in the Hit""")

    @property
    def fragments(self):
        """HSPFragment objects contained in the Hit"""
        # each HSP is iterated to obtain its fragments
        return list(chain.from_iterable(self._items))

    ## public methods ##
    def append(self, hsp):
        """Adds a HSP object to the end of Hit.

        Parameters
        hsp -- HSP object to append.

        Any HSP object appended must have the same `hit_id` property as the
        Hit object's `id` property and the same `query_id` property as the
        Hit object's `query_id` property.

        """
        self._validate_hsp(hsp)
        self._items.append(hsp)

    def filter(self, func=None):
        """Creates a new Hit object whose HSP objects pass the filter
        function.

        Arguments:
        func -- Callback function that accepts a HSP object as its parameter,
                does a boolean check, and returns True or False.

        `filter` is analogous to Python's built-in `filter` function, except
        that instead of returning a list it returns a `Hit` object. If no HSP
        passes the filter, None is returned instead. Here is an example of
        using `filter` to select for HSPs having bitscores bigger than 60:

        >>> from Bio import SearchIO
        >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml'))
        >>> hit = qresult[3]
        >>> bitscore_filter = lambda hsp: hsp.bitscore > 60
        >>> filtered_hit = hit.filter(bitscore_filter)
        >>> len(hit)
        2
        >>> len(filtered_hit)
        1
        >>> print(filtered_hit)
        Query: 33211
         mir_1
         Hit: gi|301171322|ref|NR_035857.1| (86)
         Pan troglodytes microRNA mir-520c (MIR520C), microRNA
         HSPs: ---- -------- --------- ------ --------------- ---------------------
                  #  E-value Bit score   Span     Query range             Hit range
               ---- -------- --------- ------ --------------- ---------------------
                  0  8.9e-20    100.47     60          [1:61]               [13:73]

        """
        hsps = list(filter(func, self.hsps))
        if not hsps:
            return None
        obj = self.__class__(hsps)
        self._transfer_attrs(obj)
        return obj

    def index(self, hsp):
        """Returns the index of a given HSP object, zero-based.

        Arguments:
        hsp -- HSP object to be looked up.

        """
        return self._items.index(hsp)

    def map(self, func=None):
        """Creates a new Hit object, mapping the given function to its HSPs.

        Arguments:
        func -- Callback function that accepts a HSP object as its parameter
                and also returns a HSP object.

        `map` is analogous to Python's built-in `map` function. It is applied
        to all HSPs contained in the Hit object and returns a new Hit object.
        If the Hit contains no HSPs, None is returned instead.

        """
        # work on a shallow copy so the callback sees a stable sequence
        hsps = self.hsps[:]
        if func is not None:
            hsps = [func(x) for x in hsps]
        if not hsps:
            return None
        obj = self.__class__(hsps)
        self._transfer_attrs(obj)
        return obj

    def pop(self, index=-1):
        """Removes and returns the HSP object at the specified index.

        Arguments:
        index -- Integer denoting the index of the HSP object to remove.

        """
        return self._items.pop(index)

    def sort(self, key=None, reverse=False, in_place=True):
        """Sorts the HSP objects.

        Arguments:
        key -- Function used to sort the HSP objects.
        reverse -- Boolean, whether to reverse the sorting or not.
        in_place -- Boolean, whether to perform sorting in place (in the same
                    object) or not (creating a new object).

        `sort` defaults to sorting in-place, to mimic Python's `list.sort`
        method, and returns None in that case. If you set the `in_place`
        argument to False, it will return a new, sorted Hit object and keep
        the initial one unsorted.

        """
        if in_place:
            self._items.sort(key=key, reverse=reverse)
            return None

        hsps = self.hsps[:]
        hsps.sort(key=key, reverse=reverse)
        obj = self.__class__(hsps)
        self._transfer_attrs(obj)
        return obj


# if not used as a module, run the doctest
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()