# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Parsing TRANSFAC files
"""

from Bio import motifs
from Bio.Alphabet import IUPAC


class Motif(motifs.Motif, dict):
    """Store the information of a single TRANSFAC motif.

    The class derives both from the Bio.motifs.Motif base class and from
    a Python dictionary. Whenever possible, the parser keeps motif
    information in attributes of the base class; see Bio.motifs.Motif
    for a description of those attributes. Everything else is kept as
    (key, value) pairs in the dictionary, using the two-letter field
    code from the TRANSFAC file as the key. Literature references are
    the exception: they are kept in the .references attribute.

    Fields commonly found in TRANSFAC files:
        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for
                matrix sequence; length of sequence used; number of
                gaps inserted; strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; used as a separator between sections.

    The .references attribute is a list of dictionaries, one per
    reference, with the following keys:
        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    For more information, see the TRANSFAC documentation.
    """

    # Field codes that may occur several times within one motif.
    multiple_value_keys = set(('BF', 'OV', 'HP', 'BS', 'HC', 'DT', 'DR'))

    # Field codes that describe a literature reference.
    reference_keys = set(('RX', 'RA', 'RT', 'RL'))


class Record(list):
    """Hold the contents of one TRANSFAC matrix table.

    A Record is a list whose items are the individual motifs.

    Attributes:
     - version: The version number, corresponding to the 'VV' field
                in the TRANSFAC file; None until it has been set.
    """

    def __init__(self):
        # Start out as an empty list with no known version number.
        list.__init__(self)
        self.version = None

    def __str__(self):
        # The string form is the TRANSFAC text generated by write().
        text = write(self)
        return text


def read(handle):
    """Parse a TRANSFAC matrix file and return its contents as a Record.

    Arguments:
     - handle: an iterable yielding the lines of the file, for example
       an open text file.

    Every non-blank line is split into a two-character key (columns 1-2)
    and a value (column 5 onwards). The lines are handled as follows:
     - 'VV' sets the version attribute of the returned Record;
     - 'P0' (or 'PO' in old files) starts the count matrix, which runs
       until the first line whose key is not an integer;
     - 'XX' separator lines are skipped;
     - 'RN' starts a new reference; 'RX', 'RA', 'RT' and 'RL' lines are
       added to the reference started most recently in the same entry;
     - '//' ends the current entry. If the entry contained a count
       matrix, a Motif is created from it and appended to the Record;
     - keys listed in Motif.multiple_value_keys are collected in lists;
     - any other key is stored as a single annotation value.

    An entry that is not terminated by a '//' line is not added to the
    Record.

    Raises ValueError if the matrix header does not start with A C G T,
    if matrix rows are not numbered consecutively from 1, if a matrix
    row has fewer than four counts or a count is not a number, if a
    reference number is malformed or out of sequence, or if a reference
    field is found before any 'RN' line of the entry.
    """
    annotations = {}
    references = []
    reference = None  # the reference dictionary currently being filled
    counts = None
    record = Record()
    for line in handle:
        line = line.strip()
        if not line:
            # Blank lines carry no information; do not store them under
            # an empty key.
            continue
        key, value = line[:2], line[4:]
        if key == 'VV':
            # The version belongs to the Record, not to a motif.
            record.version = value
            continue
        if key in ('P0', 'PO'):  # Old TRANSFAC files use PO instead of P0
            if value.split()[:4] != ['A', 'C', 'G', 'T']:
                raise ValueError(
                    "Matrix header should list A C G T, found '%s'" % line)
            counts = dict((c, []) for c in "ACGT")
            length = 0
            for line in handle:
                # Strip here as well, so that the line ending the matrix
                # is processed below exactly like any other line.
                line = line.strip()
                if not line:
                    continue
                key, value = line[:2], line[4:]
                try:
                    i = int(key)
                except ValueError:
                    # First line after the matrix; handle it below.
                    break
                length += 1
                if i != length:
                    raise ValueError(
                        "Expected matrix row %d, found row %d"
                        % (length, i))
                values = value.split()
                if len(values) < 4:
                    raise ValueError(
                        "Matrix row %d has fewer than 4 counts" % i)
                for c, v in zip("ACGT", values):
                    counts[c].append(float(v))
            else:
                # The handle ran out inside the matrix: nothing is left
                # to parse.
                break
        if line == 'XX':
            continue
        if key == 'RN':
            index = value.partition(";")[0]
            if not (index.startswith('[') and index.endswith(']')):
                raise ValueError(
                    "Reference number should be enclosed in square "
                    "brackets, found '%s'" % line)
            index = int(index[1:-1])
            if index != len(references) + 1:
                raise ValueError(
                    "Expected reference number %d, found %d"
                    % (len(references) + 1, index))
            reference = {key: value}
            references.append(reference)
        elif key == '//':
            if counts is not None:
                motif = Motif(alphabet=IUPAC.unambiguous_dna, counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate, so that it cannot
            # inherit the matrix or a reference of this one.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            if reference is None:
                raise ValueError(
                    "Found reference field '%s' before any RN line" % key)
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            annotations.setdefault(key, []).append(value)
        else:
            annotations[key] = value
    return record

def write(motifs):
    """Return a string with the given motifs in TRANSFAC format.

    Arguments:
     - motifs: an iterable of motifs. If it has a version attribute
       that is not None, the text starts with a 'VV' block holding it.

    For each motif, the fields are written section by section in a
    fixed order; every section that produced at least one line is
    closed by an 'XX' line. The frequency matrix ('P0') is skipped for
    motifs of length zero. References, if the motif has a .references
    attribute, are written right after the 'OV'/'PV' fields. Each motif
    ends with a '//' line.
    """
    row_format = "%02.d %6.20g %6.20g %6.20g %6.20g      %s"
    reference_fields = ("RN", "RX", "RA", "RT", "RL")
    chunks = []
    # A plain list of motifs has no version; treat that like None.
    release = getattr(motifs, "version", None)
    if release is not None:
        chunks.append("VV  %s\nXX\n//\n" % release)
    repeatable = Motif.multiple_value_keys
    layout = (('AC', 'AS',), # Accession
              ('ID',),       # ID
              ('DT', 'CO'),  # Date, copyright
              ('NA',),       # Name
              ('DE',),       # Short factor description
              ('TY',),       # Type
              ('OS', 'OC'),  # Organism
              ('HP', 'HC'),  # Superfamilies, subfamilies
              ('BF',),       # Binding factors
              ('P0',),       # Frequency matrix
              ('BA',),       # Statistical basis
              ('BS',),       # Factor binding sites
              ('CC',),       # Comments
              ('DR',),       # External databases
              ('OV', 'PV',), # Versions
             )
    for motif in motifs:
        out = []
        for group in layout:
            wrote = False  # did this section produce any line?
            for tag in group:
                if tag == 'P0':
                    # Frequency matrix: one header line plus one row per
                    # position, ending in the consensus letter.
                    ncols = motif.length
                    if ncols == 0:
                        continue
                    consensus = motif.degenerate_consensus
                    out.append("P0      A      C      G      T")
                    for pos in range(ncols):
                        row = tuple([motif.counts[base][pos]
                                     for base in "ACGT"])
                        fields = (pos + 1,) + row + (consensus[pos],)
                        out.append(row_format % fields)
                    wrote = True
                else:
                    try:
                        entry = motif.get(tag)
                    except AttributeError:
                        # Not a dictionary-like motif: no annotations.
                        entry = None
                    if entry is not None:
                        if tag in repeatable:
                            out.extend(["%s  %s" % (tag, item)
                                        for item in entry])
                        else:
                            out.append("%s  %s" % (tag, entry))
                        wrote = True
                if tag == 'PV':
                    # References share the section of the version fields.
                    for citation in getattr(motif, "references", ()):
                        for field in reference_fields:
                            text = citation.get(field)
                            if text is not None:
                                out.append("%s  %s" % (field, text))
                                wrote = True
            if wrote:
                out.append('XX')
        # This motif is complete; terminate it and join its lines.
        out.append("//")
        chunks.append("\n".join(out) + "\n")
    # All motifs are done; concatenate their text blocks.
    return "".join(chunks)