# Copyright 2006 by Sean Davis.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#
# $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $
# Sean Davis <sdavis2 at mail dot nih dot gov>
# National Cancer Institute
# National Institutes of Health
# Bethesda, MD, USA
#

"""Parse Unigene flat file format files such as the Hs.data file.

Here is an overview of the flat file format that this parser deals with:
   Line types/qualifiers:

       ID           UniGene cluster ID
       TITLE        Title for the cluster
       GENE         Gene symbol
       CYTOBAND     Cytological band
       EXPRESS      Tissues of origin for ESTs in cluster
       RESTR_EXPR   Single tissue or development stage contributes
                    more than half the total EST frequency for this gene.
       GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
                    T if a non-templated polyA tail is found among
                    a cluster's sequences; else
                    I if templated As are found in genomic sequence or
                    S if a canonical polyA signal is found on
                      the genomic sequence
       GENE_ID      Entrez gene identifier associated with at least one
                    sequence in this cluster;
                    to be used instead of LocusLink.
       LOCUSLINK    LocusLink identifier associated with at least one
                    sequence in this cluster;
                    deprecated in favor of GENE_ID
       HOMOL        Homology;
       CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping
                    on the arabidopsis genome.
       STS          STS
            ACC=         GenBank/EMBL/DDBJ accession number of STS
                         [optional field]
            UNISTS=      identifier in NCBI's UNISTS database
       TXMAP        Transcript map interval
            MARKER=      Marker found on at least one sequence in this
                         cluster
            RHPANEL=     Radiation Hybrid panel used to place marker
       PROTSIM      Protein Similarity data for the sequence with
                    highest-scoring protein similarity in this cluster
            ORG=         Organism
            PROTGI=      Sequence GI of protein
            PROTID=      Sequence ID of protein
            PCT=         Percent alignment
            ALN=         length of aligned region (aa)
       SCOUNT       Number of sequences in the cluster
       SEQUENCE     Sequence
            ACC=         GenBank/EMBL/DDBJ accession number of sequence
            NID=         Unique nucleotide sequence identifier (gi)
            PID=         Unique protein sequence identifier (used for
                         non-ESTs)
            CLONE=       Clone identifier (used for ESTs only)
            END=         End (5'/3') of clone insert read (used for
                         ESTs only)
            LID=         Library ID; see Hs.lib.info for library name
                         and tissue
            MGC=         5' CDS-completeness indicator; if present, the
                         clone associated with this sequence is believed
                         CDS-complete. A value greater than 511 is the gi
                         of the CDS-complete mRNA matched by the EST,
                         otherwise the value is an indicator of the
                         reliability of the test indicating CDS
                         completeness; higher values indicate more
                         reliable CDS-completeness predictions.
           SEQTYPE=      Description of the nucleotide sequence.
                         Possible values are mRNA, EST and HTC.
           TRACE=        The Trace ID of the EST sequence, as provided by
                         NCBI Trace Archive
"""


class SequenceLine(object):
    """Store the information for one SEQUENCE line from a Unigene file

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive
    """

    def __init__(self,text=None):
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        if text is not None:
            self.text=text
            self._init_from_text(text)

    def _init_from_text(self, text):
        parts = text.split('; ')
        for part in parts:
            key, val = part.split("=")
            if key=='CLONE':
                if val[:5]=='IMAGE':
                    self.is_image=True
                    self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class ProtsimLine(object):
    """Store the information for one PROTSIM line from a Unigene file

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)
    """

    def __init__(self,text=None):
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        if text is not None:
            self.text=text
            self._init_from_text(text)

    def _init_from_text(self, text):
        parts = text.split('; ')

        for part in parts:
            key, val = part.split("=")
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class STSLine(object):
    """Store the information for one STS line from a Unigene file

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database
    """

    def __init__(self,text=None):
        self.acc = ''
        self.unists = ''
        if text is not None:
            self.text=text
            self._init_from_text(text)

    def _init_from_text(self, text):
        parts = text.split(' ')

        for part in parts:
            key, val = part.split("=")
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class Record(object):
    """Store a Unigene record

    Here is what is stored:

        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc.
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on ';'
                                # Will be an array of strings
        self.restr_expr   = ''  # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.locuslink    = ''  # LOCUSLINK line
        self.homol        = ''  # HOMOL line
        self.chromosome   = ''  # CHROMOSOME line
        self.protsim      = []  # PROTSIM entries, array of Protsims
                                # Type ProtsimLine
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
                                # Type SequenceLine
        self.sts          = []  # STS entries, array of STS entries
                                # Type STSLine
        self.txmap        = []  # TXMAP entries, array of TXMap entries
    """

    def __init__(self):
        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc.
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on ';'
        self.restr_expr   = ''  # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.locuslink    = ''  # LOCUSLINK line
        self.homol        = ''  # HOMOL line
        self.chromosome   = ''  # CHROMOSOME line
        self.protsim      = []  # PROTSIM entries, array of Protsims
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
        self.sts          = []  # STS entries, array of STS entries
        self.txmap        = []  # TXMAP entries, array of TXMap entries

    def __repr__(self):
        return "<%s> %s %s\n%s" % (self.__class__.__name__,
                          self.ID, self.symbol, self.title)


def parse(handle):
    while True:
        record = _read(handle)
        if not record:
            return
        yield record


def read(handle):
    record = _read(handle)
    if not record:
        raise ValueError("No SwissProt record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one SwissProt record found")
    return record


# Everything below is private


def _read(handle):
    UG_INDENT = 12
    record = None
    for line in handle:
        tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag=="ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split('.')[0]
        elif tag=="TITLE":
            record.title = value
        elif tag=="GENE":
            record.symbol = value
        elif tag=="GENE_ID":
            record.gene_id = value
        elif tag=="LOCUSLINK":
            record.locuslink = value
        elif tag=="HOMOL":
            if value=="YES":
                record.homol = True
            elif value=="NO":
                record.homol = True
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag=="EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag=="RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag=="CHROMOSOME":
            record.chromosome = value
        elif tag=="CYTOBAND":
            record.cytoband = value
        elif tag=="PROTSIM":
            protsim = ProtsimLine(value)
            record.protsim.append(protsim)
        elif tag=="SCOUNT":
            scount = int(value)
        elif tag=="SEQUENCE":
            sequence = SequenceLine(value)
            record.sequence.append(sequence)
        elif tag=="STS":
            sts = STSLine(value)
            record.sts.append(sts)
        elif tag=='//':
            if len(record.sequence)!=scount:
                raise ValueError("The number of sequences specified in the record"
                                 " (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record:
        raise ValueError("Unexpected end of stream.")