# Copyright 2004 by James Casbon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
Code to deal with COMPASS output, a program for profile/profile comparison.

Compass is described in:

Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
7;326(1):317-36.

Tested with COMPASS 1.24.

Functions:
read          Reads a COMPASS file containing one COMPASS record
parse         Iterates over records in a COMPASS file.

Classes:
Record        One result of a COMPASS file
"""
import re


def read(handle):
    record = None
    try:
        line = next(handle)
        record = Record()
        __read_names(record, line)
        line = next(handle)
        __read_threshold(record, line)
        line = next(handle)
        __read_lengths(record, line)
        line = next(handle)
        __read_profilewidth(record, line)
        line = next(handle)
        __read_scores(record, line)
    except StopIteration:
        if not record:
            raise ValueError("No record found in handle")
        else:
            raise ValueError("Unexpected end of stream.")
    for line in handle:
        if not line.strip(): # skip empty lines
            continue
        __read_query_alignment(record, line)
        try:
            line = next(handle)
            __read_positive_alignment(record, line)
            line = next(handle)
            __read_hit_alignment(record, line)
        except StopIteration:
            raise ValueError("Unexpected end of stream.")
    return record


def parse(handle):
    record = None
    try:
        line = next(handle)
    except StopIteration:
        return
    while True:
        try:
            record = Record()
            __read_names(record, line)
            line = next(handle)
            __read_threshold(record, line)
            line = next(handle)
            __read_lengths(record, line)
            line = next(handle)
            __read_profilewidth(record, line)
            line = next(handle)
            __read_scores(record, line)
        except StopIteration:
            raise ValueError("Unexpected end of stream.")
        for line in handle:
            if not line.strip():
                continue
            if "Ali1:" in line:
                yield record
                break
            __read_query_alignment(record, line)
            try:
                line = next(handle)
                __read_positive_alignment(record, line)
                line = next(handle)
                __read_hit_alignment(record, line)
            except StopIteration:
                raise ValueError("Unexpected end of stream.")
        else:
            yield record
            break


class Record(object):
    """
    Hold information from one compass hit.
    Ali1 is the query, Ali2 the hit.
    """

    def __init__(self):
        self.query=''
        self.hit=''
        self.gap_threshold=0
        self.query_length=0
        self.query_filtered_length=0
        self.query_nseqs=0
        self.query_neffseqs=0
        self.hit_length=0
        self.hit_filtered_length=0
        self.hit_nseqs=0
        self.hit_neffseqs=0
        self.sw_score=0
        self.evalue=-1
        self.query_start=-1
        self.hit_start=-1
        self.query_aln=''
        self.hit_aln=''
        self.positives=''

    def query_coverage(self):
        """Return the length of the query covered in the alignment."""
        s = self.query_aln.replace("=", "")
        return len(s)

    def hit_coverage(self):
        """Return the length of the hit covered in the alignment."""
        s = self.hit_aln.replace("=", "")
        return len(s)

# Everything below is private

__regex = {"names": re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+"),
           "threshold": re.compile("Threshold of effective gap content in columns: (\S+)"),
           "lengths": re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)\s+filtered_length2=(\S+)"),
           "profilewidth": re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)"),
           "scores": re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)"),
           "start": re.compile("(\d+)"),
           "align": re.compile("^.{15}(\S+)"),
           "positive_alignment": re.compile("^.{15}(.+)"),
          }


def __read_names(record, line):
    """
    Ali1: 60456.blo.gz.aln  Ali2: allscop//14984.blo.gz.aln
          ------query-----        -------hit-------------
    """
    if not "Ali1:" in line:
        raise ValueError("Line does not contain 'Ali1:':\n%s" % line)
    m = __regex["names"].search(line)
    record.query = m.group(1)
    record.hit = m.group(2)


def __read_threshold(record, line):
    if not line.startswith("Threshold"):
        raise ValueError("Line does not start with 'Threshold':\n%s" % line)
    m = __regex["threshold"].search(line)
    record.gap_threshold = float(m.group(1))


def __read_lengths(record, line):
    if not line.startswith("length1="):
        raise ValueError("Line does not start with 'length1=':\n%s" % line)
    m = __regex["lengths"].search(line)
    record.query_length = int(m.group(1))
    record.query_filtered_length = float(m.group(2))
    record.hit_length = int(m.group(3))
    record.hit_filtered_length = float(m.group(4))


def __read_profilewidth(record, line):
    if not "Nseqs1" in line:
        raise ValueError("Line does not contain 'Nseqs1':\n%s" % line)
    m = __regex["profilewidth"].search(line)
    record.query_nseqs = int(m.group(1))
    record.query_neffseqs = float(m.group(2))
    record.hit_nseqs = int(m.group(3))
    record.hit_neffseqs = float(m.group(4))


def __read_scores(record, line):
    if not line.startswith("Smith-Waterman"):
        raise ValueError("Line does not start with 'Smith-Waterman':\n%s" % line)
    m = __regex["scores"].search(line)
    if m:
        record.sw_score = int(m.group(1))
        record.evalue = float(m.group(2))
    else:
        record.sw_score = 0
        record.evalue = -1.0


def __read_query_alignment(record, line):
    m = __regex["start"].search(line)
    if m:
        record.query_start = int(m.group(1))
    m = __regex["align"].match(line)
    assert m is not None, "invalid match"
    record.query_aln += m.group(1)


def __read_positive_alignment(record, line):
    m = __regex["positive_alignment"].match(line)
    assert m is not None, "invalid match"
    record.positives += m.group(1)


def __read_hit_alignment(record, line):
    m = __regex["start"].search(line)
    if m:
        record.hit_start = int(m.group(1))
    m = __regex["align"].match(line)
    assert m is not None, "invalid match"
    record.hit_aln += m.group(1)