# Copyright 2006 by Sean Davis, National Cancer Institute, NIH. # All rights reserved. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Parse Unigene flat file format files such as the Hs.data file. Here is an overview of the flat file format that this parser deals with: Line types/qualifiers:: ID UniGene cluster ID TITLE Title for the cluster GENE Gene symbol CYTOBAND Cytological band EXPRESS Tissues of origin for ESTs in cluster RESTR_EXPR Single tissue or development stage contributes more than half the total EST frequency for this gene. GNM_TERMINUS genomic confirmation of presence of a 3' terminus; T if a non-templated polyA tail is found among a cluster's sequences; else I if templated As are found in genomic sequence or S if a canonical polyA signal is found on the genomic sequence GENE_ID Entrez gene identifier associated with at least one sequence in this cluster; to be used instead of LocusLink. LOCUSLINK LocusLink identifier associated with at least one sequence in this cluster; deprecated in favor of GENE_ID HOMOL Homology; CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping on the arabidopsis genome. STS STS ACC= GenBank/EMBL/DDBJ accession number of STS [optional field] UNISTS= identifier in NCBI's UNISTS database TXMAP Transcript map interval MARKER= Marker found on at least one sequence in this cluster RHPANEL= Radiation Hybrid panel used to place marker PROTSIM Protein Similarity data for the sequence with highest-scoring protein similarity in this cluster ORG= Organism PROTGI= Sequence GI of protein PROTID= Sequence ID of protein PCT= Percent alignment ALN= length of aligned region (aa) SCOUNT Number of sequences in the cluster SEQUENCE Sequence ACC= GenBank/EMBL/DDBJ accession number of sequence NID= Unique nucleotide sequence identifier (gi) PID= Unique protein sequence identifier (used for non-ESTs) CLONE= Clone identifier (used for ESTs only) END= End (5'/3') of clone insert read (used for ESTs only) LID= Library ID; see Hs.lib.info for library name and tissue MGC= 5' CDS-completeness indicator; if present, the clone associated with this sequence is believed CDS-complete. A value greater than 511 is the gi of the CDS-complete mRNA matched by the EST, otherwise the value is an indicator of the reliability of the test indicating CDS completeness; higher values indicate more reliable CDS-completeness predictions. SEQTYPE= Description of the nucleotide sequence. Possible values are mRNA, EST and HTC. TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive """ class SequenceLine(object): """Store the information for one SEQUENCE line from a Unigene file. Initialize with the text part of the SEQUENCE line, or nothing. Attributes and descriptions (access as LOWER CASE): - ACC= GenBank/EMBL/DDBJ accession number of sequence - NID= Unique nucleotide sequence identifier (gi) - PID= Unique protein sequence identifier (used for non-ESTs) - CLONE= Clone identifier (used for ESTs only) - END= End (5'/3') of clone insert read (used for ESTs only) - LID= Library ID; see Hs.lib.info for library name and tissue - MGC= 5' CDS-completeness indicator; if present, the clone associated with this sequence is believed CDS-complete. A value greater than 511 is the gi of the CDS-complete mRNA matched by the EST, otherwise the value is an indicator of the reliability of the test indicating CDS completeness; higher values indicate more reliable CDS-completeness predictions. - SEQTYPE= Description of the nucleotide sequence. Possible values are mRNA, EST and HTC. - TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive """ def __init__(self, text=None): """Initialize the class.""" self.acc = "" self.nid = "" self.lid = "" self.pid = "" self.clone = "" self.image = "" self.is_image = False self.end = "" self.mgc = "" self.seqtype = "" self.trace = "" if text is not None: self.text = text self._init_from_text(text) def _init_from_text(self, text): parts = text.split("; ") for part in parts: key, val = part.split("=") if key == "CLONE": if val[:5] == "IMAGE": self.is_image = True self.image = val[6:] setattr(self, key.lower(), val) def __repr__(self): """Return UniGene SequenceLine object as a string.""" return self.text class ProtsimLine(object): """Store the information for one PROTSIM line from a Unigene file. Initialize with the text part of the PROTSIM line, or nothing. Attributes and descriptions (access as LOWER CASE) ORG= Organism PROTGI= Sequence GI of protein PROTID= Sequence ID of protein PCT= Percent alignment ALN= length of aligned region (aa) """ def __init__(self, text=None): """Initialize the class.""" self.org = "" self.protgi = "" self.protid = "" self.pct = "" self.aln = "" if text is not None: self.text = text self._init_from_text(text) def _init_from_text(self, text): parts = text.split("; ") for part in parts: key, val = part.split("=") setattr(self, key.lower(), val) def __repr__(self): """Return UniGene ProtsimLine object as a string.""" return self.text class STSLine(object): """Store the information for one STS line from a Unigene file. Initialize with the text part of the STS line, or nothing. Attributes and descriptions (access as LOWER CASE) ACC= GenBank/EMBL/DDBJ accession number of STS [optional field] UNISTS= identifier in NCBI's UNISTS database """ def __init__(self, text=None): """Initialize the class.""" self.acc = "" self.unists = "" if text is not None: self.text = text self._init_from_text(text) def _init_from_text(self, text): parts = text.split(" ") for part in parts: key, val = part.split("=") setattr(self, key.lower(), val) def __repr__(self): """Return UniGene STSLine object as a string.""" return self.text class Record(object): """Store a Unigene record. Here is what is stored:: self.ID = '' # ID line self.species = '' # Hs, Bt, etc. self.title = '' # TITLE line self.symbol = '' # GENE line self.cytoband = '' # CYTOBAND line self.express = [] # EXPRESS line, parsed on ';' # Will be an array of strings self.restr_expr = '' # RESTR_EXPR line self.gnm_terminus = '' # GNM_TERMINUS line self.gene_id = '' # GENE_ID line self.locuslink = '' # LOCUSLINK line self.homol = '' # HOMOL line self.chromosome = '' # CHROMOSOME line self.protsim = [] # PROTSIM entries, array of Protsims # Type ProtsimLine self.sequence = [] # SEQUENCE entries, array of Sequence entries # Type SequenceLine self.sts = [] # STS entries, array of STS entries # Type STSLine self.txmap = [] # TXMAP entries, array of TXMap entries """ def __init__(self): """Initialize the class.""" self.ID = "" # ID line self.species = "" # Hs, Bt, etc. self.title = "" # TITLE line self.symbol = "" # GENE line self.cytoband = "" # CYTOBAND line self.express = [] # EXPRESS line, parsed on ';' self.restr_expr = "" # RESTR_EXPR line self.gnm_terminus = "" # GNM_TERMINUS line self.gene_id = "" # GENE_ID line self.locuslink = "" # LOCUSLINK line self.homol = "" # HOMOL line self.chromosome = "" # CHROMOSOME line self.protsim = [] # PROTSIM entries, array of Protsims self.sequence = [] # SEQUENCE entries, array of Sequence entries self.sts = [] # STS entries, array of STS entries self.txmap = [] # TXMAP entries, array of TXMap entries def __repr__(self): """Represent the UniGene Record object as a string for debugging.""" return "<%s> %s %s %s" % ( self.__class__.__name__, self.ID, self.symbol, self.title, ) def parse(handle): """Read and load a UniGene records, for files containing multiple records.""" while True: record = _read(handle) if not record: return yield record def read(handle): """Read and load a UniGene record, one record per file.""" record = _read(handle) if not record: raise ValueError("No SwissProt record found") # We should have reached the end of the record by now remainder = handle.read() if remainder: raise ValueError("More than one SwissProt record found") return record # Everything below is private def _read(handle): UG_INDENT = 12 record = None for line in handle: tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip() line = line.rstrip() if tag == "ID": record = Record() record.ID = value record.species = record.ID.split(".")[0] elif tag == "TITLE": record.title = value elif tag == "GENE": record.symbol = value elif tag == "GENE_ID": record.gene_id = value elif tag == "LOCUSLINK": record.locuslink = value elif tag == "HOMOL": if value == "YES": record.homol = True elif value == "NO": record.homol = True else: raise ValueError("Cannot parse HOMOL line %s" % line) elif tag == "EXPRESS": record.express = [word.strip() for word in value.split("|")] elif tag == "RESTR_EXPR": record.restr_expr = [word.strip() for word in value.split("|")] elif tag == "CHROMOSOME": record.chromosome = value elif tag == "CYTOBAND": record.cytoband = value elif tag == "PROTSIM": protsim = ProtsimLine(value) record.protsim.append(protsim) elif tag == "SCOUNT": scount = int(value) elif tag == "SEQUENCE": sequence = SequenceLine(value) record.sequence.append(sequence) elif tag == "STS": sts = STSLine(value) record.sts.append(sts) elif tag == "//": if len(record.sequence) != scount: raise ValueError( "The number of sequences specified in the record " "(%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)) ) return record else: raise ValueError("Unknown tag %s" % tag) if record: raise ValueError("Unexpected end of stream.")