# Copyright 2012 by Wibowo Arindrarto. All rights reserved. # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Bio.SearchIO parser for Exonerate cigar output format.""" import re from Bio._py3k import _bytes_to_string from ._base import _BaseExonerateParser, _STRAND_MAP from .exonerate_vulgar import ExonerateVulgarIndexer __all__ = ("ExonerateCigarParser", "ExonerateCigarIndexer") # precompile regex _RE_CIGAR = re.compile( r"""^cigar:\s+ (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # query: ID, start, end, strand (\S+)\s+(\d+)\s+(\d+)\s+([\+-\.])\s+ # hit: ID, start, end, strand (\d+)(\s+.*)$ # score, vulgar components """, re.VERBOSE, ) class ExonerateCigarParser(_BaseExonerateParser): """Parser for Exonerate cigar strings.""" _ALN_MARK = "cigar" def parse_alignment_block(self, header): """Parse alignment block for cigar format, return query results, hits, hsps.""" qresult = header["qresult"] hit = header["hit"] hsp = header["hsp"] self.read_until(lambda line: line.startswith("cigar")) cigars = re.search(_RE_CIGAR, self.line) # if the file has c4 alignments # check if cigar values match our previously parsed header values if self.has_c4_alignment: assert qresult["id"] == cigars.group(1) assert hsp["query_start"] == cigars.group(2) assert hsp["query_end"] == cigars.group(3) assert hsp["query_strand"] == cigars.group(4) assert hit["id"] == cigars.group(5) assert hsp["hit_start"] == cigars.group(6) assert hsp["hit_end"] == cigars.group(7) assert hsp["hit_strand"] == cigars.group(8) assert hsp["score"] == cigars.group(9) else: qresult["id"] = cigars.group(1) hsp["query_start"] = cigars.group(2) hsp["query_end"] = cigars.group(3) hsp["query_strand"] = cigars.group(4) hit["id"] = cigars.group(5) hsp["hit_start"] = cigars.group(6) hsp["hit_end"] = cigars.group(7) hsp["hit_strand"] = cigars.group(8) hsp["score"] = cigars.group(9) # adjust strands hsp["query_strand"] = _STRAND_MAP[hsp["query_strand"]] hsp["hit_strand"] = _STRAND_MAP[hsp["hit_strand"]] # cast coords into ints qstart = int(hsp["query_start"]) qend = int(hsp["query_end"]) hstart = int(hsp["hit_start"]) hend = int(hsp["hit_end"]) # set coords (start <= end) hsp["query_start"] = min(qstart, qend) hsp["query_end"] = max(qstart, qend) hsp["hit_start"] = min(hstart, hend) hsp["hit_end"] = max(hstart, hend) # cast score into int hsp["score"] = int(hsp["score"]) # store cigar components hsp["cigar_comp"] = cigars.group(10) # HACK: since we can't really figure out exactly when a # HSP starts or ends, we set the entire alignment as one HSP hsp["query_ranges"] = [(hsp["query_start"], hsp["query_end"])] hsp["hit_ranges"] = [(hsp["hit_start"], hsp["hit_end"])] return {"qresult": qresult, "hit": hit, "hsp": hsp} class ExonerateCigarIndexer(ExonerateVulgarIndexer): """Indexer class for exonerate cigar lines.""" _parser = ExonerateCigarParser _query_mark = b"cigar" def get_qresult_id(self, pos): """Return the query ID of the nearest cigar line.""" handle = self._handle handle.seek(pos) # get line, check if it's a vulgar line, and get query ID line = handle.readline() assert line.startswith(self._query_mark), line id = re.search(_RE_CIGAR, _bytes_to_string(line)) return id.group(1) # if not used as a module, run the doctest if __name__ == "__main__": from Bio._utils import run_doctest run_doctest()