# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# Revisions Copyright 2007 by Peter Cock.  All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Parser for the prosite dat file from Prosite at ExPASy.

See https://www.expasy.org/prosite/

Tested with:
 - Release 20.43, 10-Feb-2009
 - Release 2017_03 of 15-Mar-2017.

Functions:
 - read                  Reads a Prosite file containing one Prosite record
 - parse                 Iterates over records in a Prosite file.

Classes:
 - Record                Holds Prosite data.

"""


def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    Records are read from the handle one at a time and yielded as
    they are completed; iteration stops as soon as no further record
    can be read.

    Arguments:
     - handle   - handle to the file.

    """
    current = __read(handle)
    while current:
        yield current
        current = __read(handle)


def read(handle):
    """Read a Prosite file that holds exactly one record.

    The first record is parsed and returned.  Any data left in the
    handle after that record is treated as an error.

    Arguments:
     - handle   - handle to the file.

    Raises ValueError if there is still content in the handle after
    the first record.

    """
    parsed = __read(handle)
    # Nothing may follow the record we have just parsed.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return parsed


class Record(object):
    """Holds information from a Prosite record.

    Main attributes:
     - name           ID of the record.  e.g. ADH_ZINC
     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
     - accession      e.g. PS00387
     - created        Date the entry was created.  (MMM-YYYY for releases
       before January 2017, DD-MMM-YYYY since January 2017)
     - data_update    Date the 'primary' data was last updated.
     - info_update    Date data other than 'primary' data was last updated.
     - pdoc           ID of the PROSITE DOCumentation.
     - description    Free-format description.
     - pattern        The PROSITE pattern.  See docs.
     - matrix         List of strings that describes a matrix entry.
     - rules          List of rule definitions (from RU lines).  (strings)
     - prorules       List of prorules (from PR lines). (strings)
     - postprocessing List of postprocessing entries (from PP lines). (strings)

    NUMERICAL RESULTS:
     - nr_sp_release  SwissProt release.
     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
     - nr_positive    True positives.  tuple of (hits, seqs)
     - nr_unknown     Could be positives.  tuple of (hits, seqs)
     - nr_false_pos   False positives.  tuple of (hits, seqs)
     - nr_false_neg   False negatives.  (int)
     - nr_partial     False negatives, because they are fragments. (int)

    COMMENTS:
     - cc_taxo_range  Taxonomic range.  See docs for format
     - cc_max_repeat  Maximum number of repetitions in a protein
     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
     - cc_skip_flag   Can this entry be ignored?
     - cc_matrix_type
     - cc_scaling_db
     - cc_author
     - cc_ft_key
     - cc_ft_desc
     - cc_version     version number (introduced in release 19.0)

    All string-valued comment attributes are empty strings when the
    corresponding qualifier is absent from the record.

    DATA BANK REFERENCES:

    The dr_* attributes are all lists of tuples
    (swiss-prot accession, swiss-prot name).

     - dr_positive
     - dr_false_neg
     - dr_false_pos
     - dr_potential   Potential hits, but fingerprint region not yet available.
     - dr_unknown     Could possibly belong
     - pdb_structs    List of PDB entries.  (strings)

    """

    def __init__(self):
        """Initialize every documented attribute to an empty default."""
        self.name = ""
        self.type = ""
        self.accession = ""
        self.created = ""
        self.data_update = ""
        self.info_update = ""
        self.pdoc = ""

        self.description = ""
        self.pattern = ""
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ""
        self.nr_sp_seqs = ""
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ""
        self.cc_max_repeat = ""
        self.cc_site = []
        self.cc_skip_flag = ""
        # These qualifiers are optional in the data file; give them a
        # default so that reading them never raises AttributeError.
        self.cc_matrix_type = ""
        self.cc_scaling_db = ""
        self.cc_author = ""
        self.cc_ft_key = ""
        self.cc_ft_desc = ""
        self.cc_version = ""

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below this line is private to this module.


def __read(handle):
    """Read the next Prosite record from handle (PRIVATE).

    Lines are consumed from the handle until a "//" terminator that
    follows an ID line is found.  A "//" seen before any ID line is taken
    to be the end of the copyright header and is skipped.

    Arguments:
     - handle   - iterable of text lines (e.g. an open file).

    Returns the populated Record, or None if the input is exhausted
    before a record terminator is reached (any partially read record is
    dropped in that case).

    Raises ValueError for an unknown line keyword, or for a malformed
    ID, DT, NR, CC or DR line.

    """
    import re

    # NR qualifiers whose value looks like "hits(seqs)" -> Record attribute.
    nr_hit_attrs = {
        "/TOTAL": "nr_total",
        "/POSITIVE": "nr_positive",
        "/UNKNOWN": "nr_unknown",
        "/FALSE_POS": "nr_false_pos",
    }
    # CC qualifiers whose value is stored verbatim -> Record attribute.
    cc_attrs = {
        "/TAXO-RANGE": "cc_taxo_range",
        "/MAX-REPEAT": "cc_max_repeat",
        "/SKIP-FLAG": "cc_skip_flag",
        "/MATRIX_TYPE": "cc_matrix_type",
        "/SCALING_DB": "cc_scaling_db",
        "/AUTHOR": "cc_author",
        "/FT_KEY": "cc_ft_key",
        "/FT_DESC": "cc_ft_desc",
        "/VERSION": "cc_version",
    }
    # DR type flag -> Record list attribute that collects the reference.
    dr_attrs = {
        "T": "dr_positive",
        "F": "dr_false_pos",
        "N": "dr_false_neg",
        "P": "dr_potential",
        "?": "dr_unknown",
    }

    record = None
    for line in handle:
        # Columns 1-2 hold the line code, the payload starts at column 6.
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == "ID":
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s" % line)
            record.name = cols[0]
            record.type = cols[1].rstrip(".")  # don't want '.'
        elif keyword == "AC":
            record.accession = value.rstrip(";")
        elif keyword == "DT":
            # e.g. from January 2017,
            # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
            # Older files had brackets round the date descriptions and used MMM-YYYY
            dates = value.rstrip(".").split("; ")
            if len(dates) < 3:
                # Report a truncated line as a parse error rather than
                # letting an IndexError escape from the lookups below.
                raise ValueError("I don't understand date line\n%s" % line)
            if dates[0].endswith((" (CREATED)", " CREATED")):
                # Remove last word
                record.created = dates[0].rsplit(" ", 1)[0]
            else:
                raise ValueError("I don't understand date line\n%s" % line)
            if dates[1].endswith((" (DATA UPDATE)", " DATA UPDATE")):
                # Remove last two words
                record.data_update = dates[1].rsplit(" ", 2)[0]
            else:
                raise ValueError("I don't understand date line\n%s" % line)
            if dates[2].endswith((" (INFO UPDATE)", " INFO UPDATE")):
                # Remove last two words
                record.info_update = dates[2].rsplit(" ", 2)[0]
            else:
                raise ValueError("I don't understand date line\n%s" % line)
        elif keyword == "DE":
            record.description = value
        elif keyword == "PA":
            # A pattern may be spread over several PA lines.
            record.pattern += value
        elif keyword == "MA":
            record.matrix.append(value)
        elif keyword == "PP":
            record.postprocessing.extend(value.split(";"))
        elif keyword == "RU":
            record.rules.append(value)
        elif keyword == "NR":
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == "/RELEASE":
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == "/FALSE_NEG":
                    record.nr_false_neg = int(data)
                elif qual == "/PARTIAL":
                    record.nr_partial = int(data)
                elif qual in nr_hit_attrs:
                    m = re.match(r"(\d+)\((\d+)\)", data)
                    if not m:
                        raise ValueError(
                            "Broken data %s in comment line\n%s" % (repr(data), line)
                        )
                    hits = tuple(map(int, m.groups()))
                    setattr(record, nr_hit_attrs[qual], hits)
                else:
                    raise ValueError(
                        "Unknown qual %s in comment line\n%s" % (repr(qual), line)
                    )
        elif keyword == "CC":
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col[:17] == "Automatic scaling":
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if col.count("=") == 0:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == "/SITE":
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in cc_attrs:
                    setattr(record, cc_attrs[qual], data)
                else:
                    raise ValueError(
                        "Unknown qual %s in comment line\n%s" % (repr(qual), line)
                    )
        elif keyword == "DR":
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                attr = dr_attrs.get(flag)
                if attr is None:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, attr).append((acc, name))
        elif keyword == "3D":
            for pdb_id in value.split():
                record.pdb_structs.append(pdb_id.rstrip(";"))
        elif keyword == "PR":
            record.prorules.extend(value.split(";"))
        elif keyword == "DO":
            record.pdoc = value.rstrip(";")
        elif keyword == "//":
            if record:
                # Terminator of the record we have been filling in.
                return record
            # No ID line seen yet, so this "//" closes the copyright
            # statement at the top of the file; keep reading.
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # Input ended without a terminating "//" line.
    return None