# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Parsing TRANSFAC files."""


from Bio import motifs


class Motif(motifs.Motif, dict):
    """Hold everything known about a single TRANSFAC motif.

    The class derives both from the Bio.motifs.Motif base class and from
    the built-in dict. Whatever the parser can express through the base
    class is stored as attributes of that base class (see Bio.motifs.Motif
    for their description). The remaining annotations are kept as
    (key, value) pairs of the dictionary, using the two-letter field code
    from the TRANSFAC file as the key. Literature references are the one
    exception: they are kept in the .references attribute.

    Fields commonly encountered in TRANSFAC files::

        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for matrix
               sequence; length of sequence used; number of gaps inserted;
               strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; these are not stored in the Record.

    The .references attribute is a list of dictionaries, one per
    reference, with these keys::

        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    Consult the TRANSFAC documentation for further details.
    """

    # Field codes that may be repeated on several lines of one motif entry.
    multiple_value_keys = {"BF", "BS", "DR", "DT", "HC", "HP", "OV"}

    # Field codes that describe a literature reference (RN is treated
    # separately, as it opens a new reference).
    reference_keys = {"RA", "RL", "RT", "RX"}


class Record(list):
    """Container for the motifs read from a TRANSFAC matrix table.

    A Record is a Python list; its items are the individual motifs.

    Attributes:
     - version - The version number, taken from the 'VV' field of the
       TRANSFAC file; None until it has been set.

    """

    def __init__(self):
        """Create an empty record without a version number."""
        self.version = None

    def __str__(self):
        """Return the record formatted as TRANSFAC text."""
        text = write(self)
        return text


def read(handle, strict=True):
    """Parse a transfac format handle into a Record object.

    Arguments:
     - handle - iterable over the lines of a TRANSFAC file, typically an
       open file handle
     - strict - if True (default), additionally require that keys have two
       characters, that key and value columns are separated by two spaces,
       that matrix rows have a value and a row number of at least two
       digits, and that the matrix starts at row "01"

    Returns a Record containing one Motif for each "//"-terminated entry
    that has a count matrix. Entries without a matrix (such as the "VV"
    version header) do not produce a motif.

    Raises ValueError if the input does not follow the TRANSFAC format.
    """
    # Both loops below draw from the same iterator, so the matrix rows are
    # consumed exactly once even if handle is a plain list of lines.
    lines = iter(handle)
    annotations = {}
    references = []
    counts = None
    record = Record()
    for line in lines:
        line = line.strip()
        if not line:
            continue
        key_value = line.split(None, 1)
        key = key_value[0]
        if strict and len(key) != 2:
            raise ValueError("The key value of a TRANSFAC motif line "
                             "should have 2 characters: "
                             '"{0:s}"'.format(line))
        if len(key_value) == 2:
            value = key_value[1].strip()
            if strict and "  " not in line:
                raise ValueError("A TRANSFAC motif line should have 2 "
                                 "spaces between key and value columns: "
                                 '"{0:s}"'.format(line))
        else:
            # Key-only line (e.g. "XX" or "//"): never reuse the value that
            # was left over from the previous line.
            value = ""
        if key == "VV":
            record.version = value
        elif key in ("P0", "PO"):  # Old TRANSFAC files use PO instead of P0
            if value.split()[:4] != ["A", "C", "G", "T"]:
                raise ValueError('A TRANSFAC matrix "{0:s}" line should be '
                                 'followed by "A C G T": '
                                 '"{1:s}"'.format(key, line))
            counts = {c: [] for c in "ACGT"}
            length = 0
            for line in lines:
                line = line.strip()
                if not line:
                    # Blank lines are ignored, as in the outer loop.
                    continue
                key_value = line.split(None, 1)
                key = key_value[0]
                if len(key_value) == 2:
                    value = key_value[1].strip()
                    if strict and "  " not in line:
                        raise ValueError("A TRANSFAC motif line should "
                                         "have 2 spaces between key and "
                                         "value columns: "
                                         '"{0:s}"'.format(line))
                else:
                    value = ""
                try:
                    i = int(key)
                except ValueError:
                    # First non-numeric key ends the matrix; this line is
                    # then handled by the generic code below.
                    break
                if length == 0 and i == 0:
                    # Tolerated in non-strict mode: matrix numbered from 00.
                    if strict:
                        raise ValueError("A TRANSFAC matrix should start with "
                                         '"01" as first row of the matrix, '
                                         'but this matrix uses "00": '
                                         '"{0:s}"'.format(line))
                else:
                    length += 1
                if i != length:
                    raise ValueError("The TRANSFAC matrix row number does not "
                                     "match the position in the matrix: "
                                     '"{0:s}"'.format(line))
                if strict:
                    if len(key) == 1:
                        raise ValueError("A TRANSFAC matrix line should have a "
                                         "2 digit key at the start of the line "
                                         '("{0:02d}"), but this matrix uses '
                                         '"{0:d}": "{1:s}".'.format(i, line))
                    if len(key_value) != 2:
                        raise ValueError("A TRANSFAC matrix line should have "
                                         "a key and a value: "
                                         '"{0:s}"'.format(line))
                values = value.split()[:4]
                if len(values) != 4:
                    raise ValueError("A TRANSFAC matrix line should have a "
                                     "value for each nucleotide "
                                     '(A, C, G and T): "{0:s}"'.format(line))
                for c, v in zip("ACGT", values):
                    counts[c].append(float(v))
        # Deliberately "if" rather than "elif": the line that terminated the
        # matrix above still has to be processed here.
        if line == "XX":
            pass
        elif key == "RN":
            index = value.partition(";")[0]
            if not index.startswith("["):
                raise ValueError('The index "{0:s}" in a TRANSFAC RN line '
                                 "should start with a "
                                 '"[": "{1:s}"'.format(index, line))
            if not index.endswith("]"):
                raise ValueError('The index "{0:s}" in a TRANSFAC RN line '
                                 "should end with a "
                                 '"]": "{1:s}"'.format(index, line))
            index = int(index[1:-1])
            if len(references) != index - 1:
                raise ValueError('The index "{0:d}" of the TRANSFAC RN line '
                                 "does not match the current number of seen "
                                 'references "{1:d}": "{2:s}"'.format(index, len(references) + 1, line))
            references.append({key: value})
        elif key == "//":
            if counts is not None:
                motif = Motif(alphabet="ACGT", counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate; in particular an
            # entry without a matrix must not inherit the previous counts.
            annotations = {}
            references = []
            counts = None
        elif key in Motif.reference_keys:
            if not references:
                # Without a preceding RN line there is no reference of the
                # current entry to attach this field to.
                raise ValueError("A TRANSFAC reference line should be "
                                 "preceded by an RN line: "
                                 '"{0:s}"'.format(line))
            references[-1][key] = value
        elif key in Motif.multiple_value_keys:
            if key not in annotations:
                annotations[key] = []
            annotations[key].append(value)
        else:
            annotations[key] = value
    return record


def write(motifs):
    """Return the given motifs as a string in TRANSFAC format.

    Arguments:
     - motifs - iterable of motifs; if it has a .version attribute that is
       not None (as a Record does), a "VV" header block is written first

    Each motif is written as a series of sections, every non-empty section
    being closed by an "XX" line, and the motif itself by a "//" line.
    """
    output = []
    header_version = getattr(motifs, "version", None)
    if header_version is not None:
        output.append("VV  %s\nXX\n//\n" % header_version)
    repeatable = Motif.multiple_value_keys
    # Fields grouped the way they are laid out in the file; every group
    # that produces output is followed by an "XX" separator line.
    layout = (
        ("AC", "AS"),  # Accession
        ("ID",),       # ID
        ("DT", "CO"),  # Date, copyright
        ("NA",),       # Name
        ("DE",),       # Short factor description
        ("TY",),       # Type
        ("OS", "OC"),  # Organism
        ("HP", "HC"),  # Superfamilies, subfamilies
        ("BF",),       # Binding factors
        ("P0",),       # Frequency matrix
        ("BA",),       # Statistical basis
        ("BS",),       # Factor binding sites
        ("CC",),       # Comments
        ("DR",),       # External databases
        ("OV", "PV"),  # Versions
    )
    reference_order = ("RN", "RX", "RA", "RT", "RL")
    for motif in motifs:
        rows = []
        for group in layout:
            wrote_something = False
            for field in group:
                if field == "P0":
                    # Frequency matrix; omitted for motifs of length zero
                    width = motif.length
                    if width == 0:
                        continue
                    consensus = motif.degenerate_consensus
                    alphabet = sorted(motif.alphabet)
                    rows.append("      ".join(["P0"] + alphabet))
                    # Row number, one count per letter, consensus letter
                    template = " ".join(
                        ["%02.d"] + ["%6.20g"] * len(alphabet)
                    ) + "      %s"
                    for position in range(width):
                        cells = [position + 1]
                        cells.extend(
                            motif.counts[letter][position]
                            for letter in alphabet
                        )
                        cells.append(consensus[position])
                        rows.append(template % tuple(cells))
                    wrote_something = True
                else:
                    # Motifs that are not dictionaries have no .get method
                    try:
                        entry = motif.get(field)
                    except AttributeError:
                        entry = None
                    if entry is not None:
                        if field in repeatable:
                            rows.extend(
                                "%s  %s" % (field, item) for item in entry
                            )
                        else:
                            rows.append("%s  %s" % (field, entry))
                        wrote_something = True
                if field == "PV":
                    # References go into the same section as the versions
                    for citation in getattr(motif, "references", ()):
                        for tag in reference_order:
                            text = citation.get(tag)
                            if text is not None:
                                rows.append("%s  %s" % (tag, text))
                                wrote_something = True
            if wrote_something:
                rows.append("XX")
        # End of this motif; join its lines into one block
        rows.append("//")
        output.append("\n".join(rows) + "\n")
    # All motifs done; concatenate the blocks
    return "".join(output)