# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# Revisions Copyright 2007 by Peter Cock.  All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Parser for the prosite dat file from Prosite at ExPASy.

See https://www.expasy.org/prosite/

Tested with:
 - Release 20.43, 10-Feb-2009
 - Release 2017_03 of 15-Mar-2017.

Functions:
 - read                  Reads a Prosite file containing one Prosite record
 - parse                 Iterates over records in a Prosite file.

Classes:
 - Record                Holds Prosite data.

"""


def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    Records are read from the handle one at a time and yielded in file
    order; iteration stops as soon as the reader returns no record.

    Arguments:
     - handle   - handle to the file.

    """
    record = __read(handle)
    while record:
        yield record
        # Fetch the next record only after the consumer asks for it
        record = __read(handle)


def read(handle):
    """Read a single Prosite record.

    Use this function for Prosite files that contain exactly one
    record.  Whatever the reader returns for the first record is
    returned unchanged.

    Arguments:
     - handle   - handle to the file.

    Raises ValueError if any data is left in the handle after the
    first record has been read.

    """
    record = __read(handle)
    # Nothing may follow the first record in a single-record file
    leftover = handle.read()
    if not leftover:
        return record
    raise ValueError("More than one Prosite record found")


class Record(object):
    """Holds information from a Prosite record.

    Every attribute listed here is created by the constructor, so it can
    always be read, even when the corresponding line or qualifier was
    absent from the parsed entry.

    Main attributes:
     - name           ID of the record.  e.g. ADH_ZINC
     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
     - accession      e.g. PS00387
     - created        Date the entry was created.  (MMM-YYYY for releases
       before January 2017, DD-MMM-YYYY since January 2017)
     - data_update    Date the 'primary' data was last updated.
     - info_update    Date data other than 'primary' data was last updated.
     - pdoc           ID of the PROSITE DOCumentation.
     - description    Free-format description.
     - pattern        The PROSITE pattern.  See docs.
     - matrix         List of strings that describes a matrix entry.
     - rules          List of rule definitions (from RU lines).  (strings)
     - prorules       List of prorules (from PR lines). (strings)
     - postprocessing List of postprocessing entries (from PP lines). (strings)

    NUMERICAL RESULTS:
     - nr_sp_release  SwissProt release.
     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
     - nr_positive    True positives.  tuple of (hits, seqs)
     - nr_unknown     Could be positives.  tuple of (hits, seqs)
     - nr_false_pos   False positives.  tuple of (hits, seqs)
     - nr_false_neg   False negatives.  (int)
     - nr_partial     False negatives, because they are fragments. (int)

    COMMENTS:
     - cc_taxo_range  Taxonomic range.  See docs for format
     - cc_max_repeat  Maximum number of repetitions in a protein
     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
     - cc_skip_flag   Can this entry be ignored?
     - cc_matrix_type
     - cc_scaling_db
     - cc_author
     - cc_ft_key
     - cc_ft_desc
     - cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES:

    The dr_* attributes are all lists of tuples
    (swiss-prot accession, swiss-prot name).

     - dr_positive
     - dr_false_neg
     - dr_false_pos
     - dr_potential   Potential hits, but fingerprint region not yet available.
     - dr_unknown     Could possibly belong
     - pdb_structs    List of PDB entries.  (strings)

    """

    def __init__(self):
        """Initialize the class with empty values for every attribute."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        # Stays an empty string until a release qualifier supplies the int
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These qualifiers are optional in an entry; give them defaults so
        # that reading them never fails with AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below this line is private to the module.

# NR qualifiers whose value is a "hits(seqs)" pair -> Record attribute name.
_NR_HIT_ATTRIBUTES = {
    '/TOTAL': 'nr_total',
    '/POSITIVE': 'nr_positive',
    '/UNKNOWN': 'nr_unknown',
    '/FALSE_POS': 'nr_false_pos',
}

# CC qualifiers whose value is stored verbatim -> Record attribute name.
_CC_TEXT_ATTRIBUTES = {
    '/TAXO-RANGE': 'cc_taxo_range',
    '/MAX-REPEAT': 'cc_max_repeat',
    '/SKIP-FLAG': 'cc_skip_flag',
    '/MATRIX_TYPE': 'cc_matrix_type',
    '/SCALING_DB': 'cc_scaling_db',
    '/AUTHOR': 'cc_author',
    '/FT_KEY': 'cc_ft_key',
    '/FT_DESC': 'cc_ft_desc',
    '/VERSION': 'cc_version',
}

# DR type flag -> name of the Record list collecting those references.
_DR_FLAG_ATTRIBUTES = {
    'T': 'dr_positive',
    'F': 'dr_false_pos',
    'N': 'dr_false_neg',
    'P': 'dr_potential',
    '?': 'dr_unknown',
}

# Position on the DT line -> (Record attribute, accepted suffixes,
# number of trailing words to strip to leave only the date).
_DT_FIELDS = (
    ('created', (' (CREATED)', ' CREATED'), 1),
    ('data_update', (' (DATA UPDATE)', ' DATA UPDATE'), 2),
    ('info_update', (' (INFO UPDATE)', ' INFO UPDATE'), 2),
)


def _parse_dt(record, value, line):
    """Store the created / data update / info update dates of a DT line."""
    # e.g. from January 2017,
    # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
    # Older files had brackets round the date descriptions and used MMM-YYYY
    dates = value.rstrip('.').split("; ")
    if len(dates) < len(_DT_FIELDS):
        # Report a short line like any other malformed date line instead of
        # failing with IndexError further down.
        raise ValueError("I don't understand date line\n%s" % line)
    for date, (attribute, suffixes, words) in zip(dates, _DT_FIELDS):
        if not date.endswith(suffixes):
            raise ValueError("I don't understand date line\n%s" % line)
        # Drop the trailing description word(s), keeping only the date
        setattr(record, attribute, date.rsplit(" ", words)[0])


def _parse_nr(record, value, line):
    """Store the qualifiers of an NR (numerical results) line."""
    import re
    for col in value.split(";"):
        if not col:
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/RELEASE':
            release, seqs = data.split(",")
            record.nr_sp_release = release
            record.nr_sp_seqs = int(seqs)
        elif qual == '/FALSE_NEG':
            record.nr_false_neg = int(data)
        elif qual == '/PARTIAL':
            record.nr_partial = int(data)
        elif qual in _NR_HIT_ATTRIBUTES:
            m = re.match(r'(\d+)\((\d+)\)', data)
            if not m:
                # ValueError, like every other parse failure in this parser
                raise ValueError("Broken data %s in comment line\n%s"
                                 % (repr(data), line))
            hits = tuple(map(int, m.groups()))
            setattr(record, _NR_HIT_ATTRIBUTES[qual], hits)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_cc(record, value, line):
    """Store the qualifiers of a CC (comment) line."""
    # Expect CC lines like this:
    # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
    # Can (normally) split on ";" and then on "="
    for col in value.split(";"):
        if not col or col[:17] == 'Automatic scaling':
            # DNAJ_2 in Release 15 has a non-standard comment line:
            # CC   Automatic scaling using reversed database
            # Throw it away.  (Should I keep it?)
            continue
        if "=" not in col:
            # Missing qualifier!  Can we recover gracefully?
            # For example, from Bug 2403, in PS50293 have:
            # CC /AUTHOR=K_Hofmann; N_Hulo
            # This also lets free-text CC lines without "=" pass untouched.
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/SITE':
            pos, desc = data.split(",")
            record.cc_site.append((int(pos), desc))
        elif qual in _CC_TEXT_ATTRIBUTES:
            setattr(record, _CC_TEXT_ATTRIBUTES[qual], data)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_dr(record, value):
    """Sort the (accession, name) references of a DR line by type flag."""
    for ref in value.split(";"):
        if not ref:
            continue
        acc, name, flag = [word.strip() for word in ref.split(",")]
        attribute = _DR_FLAG_ATTRIBUTES.get(flag)
        if attribute is None:
            raise ValueError("I don't understand type flag %s" % flag)
        getattr(record, attribute).append((acc, name))


def __read(handle):
    """Read the next Prosite record from the handle.

    Lines are consumed up to and including the "//" line that closes the
    record.  The first two characters of each line are the line code and
    the text from the sixth character on is its value.  A "//" line seen
    before any ID line is skipped, so a leading block that is closed by
    "//" is not treated as a record.

    Arguments:
     - handle   - iterable yielding the lines of the file.

    Returns the Record built by the ID line and the lines that follow
    it, or None if the handle is exhausted before any record starts.

    Raises ValueError for an unknown line code, a malformed line, or a
    record that is cut off before its closing "//" line.

    """
    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            _parse_dt(record, value, line)
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may continue over several PA lines
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            _parse_nr(record, value, line)
        elif keyword == 'CC':
            _parse_cc(record, value, line)
        elif keyword == 'DR':
            _parse_dr(record, value)
        elif keyword == '3D':
            for pdb_id in value.split():
                record.pdb_structs.append(pdb_id.rstrip(';'))
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if record is not None:
                return record
            # No record yet: this "//" closed the copyright statement
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    if record is not None:
        # The input ended in the middle of a record; do not drop it silently
        raise ValueError("Unexpected end of stream.")
    return None