# Copyright 2006 by Sean Davis.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#
# $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $
# Sean Davis
# National Cancer Institute
# National Institutes of Health
# Bethesda, MD, USA
#
"""Parse Unigene flat file format files such as the Hs.data file.

Here is an overview of the flat file format that this parser deals with:

   Line types/qualifiers:

       ID           UniGene cluster ID
       TITLE        Title for the cluster
       GENE         Gene symbol
       CYTOBAND     Cytological band
       EXPRESS      Tissues of origin for ESTs in cluster
       RESTR_EXPR   Single tissue or development stage contributes
                    more than half the total EST frequency for this gene.
       GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
                    T if a non-templated polyA tail is found among
                    a cluster's sequences; else
                    I if templated As are found in genomic sequence or
                    S if a canonical polyA signal is found on
                      the genomic sequence
       GENE_ID      Entrez gene identifier associated with at least one
                    sequence in this cluster;
                    to be used instead of LocusLink.
       LOCUSLINK    LocusLink identifier associated with at least one
                    sequence in this cluster;
                    deprecated in favor of GENE_ID
       HOMOL        Homology;
       CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping
                    on the arabidopsis genome.
       STS          STS
            ACC=         GenBank/EMBL/DDBJ accession number of STS
                         [optional field]
            UNISTS=      identifier in NCBI's UNISTS database
       TXMAP        Transcript map interval
            MARKER=      Marker found on at least one sequence in this
                         cluster
            RHPANEL=     Radiation Hybrid panel used to place marker
       PROTSIM      Protein Similarity data for the sequence with
                    highest-scoring protein similarity in this cluster
            ORG=         Organism
            PROTGI=      Sequence GI of protein
            PROTID=      Sequence ID of protein
            PCT=         Percent alignment
            ALN=         length of aligned region (aa)
       SCOUNT       Number of sequences in the cluster
       SEQUENCE     Sequence
            ACC=         GenBank/EMBL/DDBJ accession number of sequence
            NID=         Unique nucleotide sequence identifier (gi)
            PID=         Unique protein sequence identifier (used for
                         non-ESTs)
            CLONE=       Clone identifier (used for ESTs only)
            END=         End (5'/3') of clone insert read (used for
                         ESTs only)
            LID=         Library ID; see Hs.lib.info for library name
                         and tissue
            MGC=         5' CDS-completeness indicator; if present, the
                         clone associated with this sequence is believed
                         CDS-complete.  A value greater than 511 is the
                         gi of the CDS-complete mRNA matched by the EST,
                         otherwise the value is an indicator of the
                         reliability of the test indicating CDS
                         completeness; higher values indicate more
                         reliable CDS-completeness predictions.
            SEQTYPE=     Description of the nucleotide sequence.
                         Possible values are mRNA, EST and HTC.
            TRACE=       The Trace ID of the EST sequence, as provided
                         by NCBI Trace Archive
"""


class SequenceLine:
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

        ACC=         GenBank/EMBL/DDBJ accession number of sequence
        NID=         Unique nucleotide sequence identifier (gi)
        PID=         Unique protein sequence identifier (used for non-ESTs)
        CLONE=       Clone identifier (used for ESTs only)
        END=         End (5'/3') of clone insert read (used for ESTs only)
        LID=         Library ID; see Hs.lib.info for library name and tissue
        MGC=         5' CDS-completeness indicator; if present, the clone
                     associated with this sequence is believed
                     CDS-complete.  A value greater than 511 is the gi of
                     the CDS-complete mRNA matched by the EST, otherwise the
                     value is an indicator of the reliability of the test
                     indicating CDS completeness; higher values indicate
                     more reliable CDS-completeness predictions.
        SEQTYPE=     Description of the nucleotide sequence.  Possible
                     values are mRNA, EST and HTC.
        TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                     Trace Archive

    In addition, ``text`` holds the raw line text (empty if none was given),
    ``is_image`` is True when the CLONE value starts with 'IMAGE', and
    ``image`` then holds the CLONE value with its first six characters
    removed.
    """

    def __init__(self, text=None):
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' part of text.

        Parts are separated by '; '.  Raises ValueError if a part does not
        contain '='.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val.startswith('IMAGE'):
                self.is_image = True
                # Drop the 'IMAGE' prefix plus the separator character.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class ProtsimLine:
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

        ORG=         Organism
        PROTGI=      Sequence GI of protein
        PROTID=      Sequence ID of protein
        PCT=         Percent alignment
        ALN=         length of aligned region (aa)

    In addition, ``text`` holds the raw line text (empty if none was given).
    """

    def __init__(self, text=None):
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' part of text.

        Parts are separated by '; '.  Raises ValueError if a part does not
        contain '='.
        """
        for part in text.split('; '):
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class STSLine:
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

        ACC=         GenBank/EMBL/DDBJ accession number of STS
                     [optional field]
        UNISTS=      identifier in NCBI's UNISTS database

    In addition, ``text`` holds the raw line text (empty if none was given).
    """

    def __init__(self, text=None):
        self.acc = ''
        self.unists = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' part of text.

        Unlike SEQUENCE and PROTSIM lines, the parts of an STS line are
        separated by a single space.  Raises ValueError if a part does not
        contain '='.
        """
        for part in text.split(' '):
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class Record:
    """Store a Unigene record.

    Here is what is stored:

        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc. (ID up to the first '.')
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on '|'
                                # Will be an array of strings
        self.restr_expr   = ''  # RESTR_EXPR line; once parsed, an array
                                # of strings split on '|'
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.locuslink    = ''  # LOCUSLINK line
        self.homol        = ''  # HOMOL line; once parsed, True for YES
                                # and False for NO
        self.chromosome   = ''  # CHROMOSOME line
        self.protsim      = []  # PROTSIM entries, array of Protsims
                                # Type ProtsimLine
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
                                # Type SequenceLine
        self.sts          = []  # STS entries, array of STS entries
                                # Type STSLine
        self.txmap        = []  # TXMAP entries, array of strings (the
                                # unparsed text of each TXMAP line)
    """

    def __init__(self):
        self.ID = ''            # ID line
        self.species = ''       # Hs, Bt, etc.
        self.title = ''         # TITLE line
        self.symbol = ''        # GENE line
        self.cytoband = ''      # CYTOBAND line
        self.express = []       # EXPRESS line, parsed on '|'
        self.restr_expr = ''    # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id = ''       # GENE_ID line
        self.locuslink = ''     # LOCUSLINK line
        self.homol = ''         # HOMOL line
        self.chromosome = ''    # CHROMOSOME line
        self.protsim = []       # PROTSIM entries, array of Protsims
        self.sequence = []      # SEQUENCE entries, array of Sequence entries
        self.sts = []           # STS entries, array of STS entries
        self.txmap = []         # TXMAP entries, array of TXMap entries

    def __repr__(self):
        return "<%s> %s %s\n%s" % (self.__class__.__name__,
                                   self.ID, self.symbol, self.title)


def parse(handle):
    """Iterate over the UniGene records in handle.

    handle is an iterable of lines (typically an open file).  Yields one
    Record per '//'-terminated record until the handle is exhausted.
    Raises ValueError on malformed input (see _read).
    """
    while True:
        record = _read(handle)
        if record is None:
            return
        yield record


def read(handle):
    """Read exactly one UniGene record from handle and return it.

    handle must be iterable line by line and provide a read() method.
    Raises ValueError if handle contains no record, if anything is left in
    handle after the first record, or if the record is malformed.
    """
    record = _read(handle)
    if record is None:
        raise ValueError("No UniGene record found")
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record


# Everything below is private


def _read(handle):
    """Read the next record from handle.

    Returns a Record once its terminating '//' line has been consumed, or
    None if handle is exhausted before any record starts.

    Raises ValueError if
      - a line other than ID appears before the ID line of a record,
      - a HOMOL line holds something other than YES or NO,
      - a line carries an unknown tag,
      - the record has no SCOUNT line, or SCOUNT disagrees with the number
        of SEQUENCE lines found,
      - handle ends in the middle of a record.
    """
    # The tag occupies the first UG_INDENT columns; the value follows.
    UG_INDENT = 12
    record = None
    scount = None
    for line in handle:
        tag = line[:UG_INDENT].rstrip()
        value = line[UG_INDENT:].rstrip()
        line = line.rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = record.ID.split('.')[0]
            scount = None
        elif record is None:
            # Every other line type needs a record to store into.
            raise ValueError("Found line before the ID line of a record: %s"
                             % line)
        elif tag == "TITLE":
            record.title = value
        elif tag == "GENE":
            record.symbol = value
        elif tag == "GENE_ID":
            record.gene_id = value
        elif tag == "LOCUSLINK":
            record.locuslink = value
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line)
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "GNM_TERMINUS":
            record.gnm_terminus = value
        elif tag == "CHROMOSOME":
            record.chromosome = value
        elif tag == "CYTOBAND":
            record.cytoband = value
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == "TXMAP":
            # Stored as the unparsed text of the line.
            record.txmap.append(value)
        elif tag == '//':
            if scount is None:
                raise ValueError("The record %s has no SCOUNT line"
                                 % record.ID)
            if len(record.sequence) != scount:
                raise ValueError("The number of sequences specified in the record (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record is not None:
        # The handle ran out before the terminating '//' line.
        raise ValueError("Unexpected end of stream.")
    return None


# Everything below is deprecated

# NOTE: the star import is kept at this position on purpose; the deprecated
# classes below use AbstractConsumer, AbstractParser and File, none of which
# is imported anywhere else in this file (presumably the star import
# provides them -- verify against Bio.ParserSupport).
from Bio.ParserSupport import *
import re
import warnings

#
# CONSTANTS
#
UG_INDENT = 12


class UnigeneSequenceRecord:
    """Store the information for one SEQUENCE line from a Unigene file
    (DEPRECATED).

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

        ACC=         GenBank/EMBL/DDBJ accession number of sequence
        NID=         Unique nucleotide sequence identifier (gi)
        PID=         Unique protein sequence identifier (used for non-ESTs)
        CLONE=       Clone identifier (used for ESTs only)
        END=         End (5'/3') of clone insert read (used for ESTs only)
        LID=         Library ID; see Hs.lib.info for library name and tissue
        MGC=         5' CDS-completeness indicator; if present, the clone
                     associated with this sequence is believed
                     CDS-complete.  A value greater than 511 is the gi of
                     the CDS-complete mRNA matched by the EST, otherwise the
                     value is an indicator of the reliability of the test
                     indicating CDS completeness; higher values indicate
                     more reliable CDS-completeness predictions.
        SEQTYPE=     Description of the nucleotide sequence.  Possible
                     values are mRNA, EST and HTC.
        TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                     Trace Archive
        PERIPHERAL=  Indicator that the sequence is a suboptimal
                     representative of the gene represented by this
                     cluster.  Peripheral sequences are those that are in a
                     cluster which represents a spliced gene without sharing
                     a splice junction with any other sequence.  In many
                     cases, they are unspliced transcripts originating from
                     the gene.

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self, text=None):
        warnings.warn("Bio.UniGene.UnigeneSequenceRecord is deprecated; please use the read() function in this module instead", DeprecationWarning)
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        # The parser stores TRACE= under the lower-case name 'trace'.
        self.trace = ''
        # Historical misspelling of the default above; kept so that existing
        # code reading .Trace keeps working.
        self.Trace = ''
        self.peripheral = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' part of text.

        Parts are separated by '; '.
        """
        for part in text.split('; '):
            key, val = re.match(r'(\w+)=(\S+)', part).groups()
            if key == 'CLONE' and val.startswith('IMAGE'):
                self.is_image = True
                # Drop the 'IMAGE' prefix plus the separator character.
                self.image = val[6:]
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class UnigeneProtsimRecord:
    """Store the information for one PROTSIM line from a Unigene file
    (DEPRECATED).

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

        ORG=         Organism
        PROTGI=      Sequence GI of protein
        PROTID=      Sequence ID of protein
        PCT=         Percent alignment
        ALN=         length of aligned region (aa)

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self, text=None):
        warnings.warn("Bio.UniGene.UnigeneProtsimRecord is deprecated; please use the read() function in this module instead", DeprecationWarning)
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' part of text.

        Parts are separated by '; '.
        """
        for part in text.split('; '):
            key, val = re.match(r'(\w+)=(\S+)', part).groups()
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class UnigeneSTSRecord:
    """Store the information for one STS line from a Unigene file
    (DEPRECATED).

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

        NAME=        Name of STS
        ACC=         GenBank/EMBL/DDBJ accession number of STS
                     [optional field]
        DSEG=        GDB Dsegment number [optional field]
        UNISTS=      identifier in NCBI's UNISTS database

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self, text=None):
        warnings.warn("Bio.UniGene.UnigeneSTSRecord is deprecated; please use the read() function in this module instead", DeprecationWarning)
        self.name = ''
        self.acc = ''
        self.dseg = ''
        self.unists = ''
        # Always define text so that repr() also works on an empty instance.
        self.text = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per 'KEY=value' part of text.

        Parts are separated by a single space.
        """
        for part in text.split(' '):
            key, val = re.match(r'(\w+)=(\S+)', part).groups()
            setattr(self, key.lower(), val)

    def __repr__(self):
        return self.text


class UnigeneRecord:
    """Store a Unigene record (DEPRECATED).

    Here is what is stored:

        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc.
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on ';'
                                # Will be an array of strings
        self.restr_expr   = ''  # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.chromosome   = ''  # CHROMOSOME
        self.protsim      = []  # PROTSIM entries, array of Protsims
                                # Type UnigeneProtsimRecord
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
                                # Type UnigeneSequenceRecord
        self.sts          = []  # STS entries, array of STS entries
                                # Type UnigeneSTSRecord
        self.txmap        = []  # TXMAP entries, array of TXMap entries

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        warnings.warn("Bio.UniGene.UnigeneRecord is deprecated; please use the read() function in this module instead", DeprecationWarning)
        self.ID = ''            # ID line
        self.species = ''       # Hs, Bt, etc.
        self.title = ''         # TITLE line
        self.symbol = ''        # GENE line
        self.cytoband = ''      # CYTOBAND line
        self.express = []       # EXPRESS line, parsed on ';'
        self.restr_expr = ''    # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id = ''       # GENE_ID line
        self.chromosome = ''    # CHROMOSOME
        self.protsim = []       # PROTSIM entries, array of Protsims
        self.sequence = []      # SEQUENCE entries, array of Sequence entries
        self.sts = []           # STS entries, array of STS entries
        self.txmap = []         # TXMAP entries, array of TXMap entries

    def __repr__(self):
        return "<%s> %s %s\n%s" % (self.__class__.__name__,
                                   self.ID, self.symbol, self.title)


class _RecordConsumer(AbstractConsumer):
    """Build a UnigeneRecord from the lines fed by _Scanner (DEPRECATED).

    Each upper-case method is named after the tag of the line it receives
    and stores the text following the tag columns on self.unigene_record.

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        warnings.warn("Bio.UniGene._RecordConsumer is deprecated; please use the read() function in this module instead", DeprecationWarning)
        self.unigene_record = UnigeneRecord()

    def ID(self, line):
        self.unigene_record.ID = self._get_single_entry(line)
        self.unigene_record.species = self.unigene_record.ID.split('.')[0]

    def TITLE(self, line):
        self.unigene_record.title = self._get_single_entry(line)

    def GENE(self, line):
        self.unigene_record.symbol = self._get_single_entry(line)

    def EXPRESS(self, line):
        self.unigene_record.express = self._get_array_entry(line,
                                                            split_on='; ')

    def RESTR_EXPR(self, line):
        self.unigene_record.restr_expr = self._get_single_entry(line)

    def GENE_ID(self, line):
        self.unigene_record.gene_id = self._get_single_entry(line)

    def CHROMOSOME(self, line):
        self.unigene_record.chromosome = self._get_single_entry(line)

    def SEQUENCE(self, line):
        ug_seqrecord = UnigeneSequenceRecord(self._get_single_entry(line))
        self.unigene_record.sequence.append(ug_seqrecord)

    def PROTSIM(self, line):
        ug_protsimrecord = UnigeneProtsimRecord(self._get_single_entry(line))
        self.unigene_record.protsim.append(ug_protsimrecord)

    def STS(self, line):
        ug_stsrecord = UnigeneSTSRecord(self._get_single_entry(line))
        self.unigene_record.sts.append(ug_stsrecord)

    def _get_single_entry(self, line):
        """Consume a single-value line: return the text after the tag."""
        return line[UG_INDENT:]

    def _get_array_entry(self, line, split_on):
        """Consume a multi-value line by splitting on split_on."""
        return line[UG_INDENT:].split(split_on)


class _Scanner:
    """Scans a Unigene Flat File Format file (DEPRECATED).

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        warnings.warn("Bio.UniGene._Scanner is deprecated; please use the read() function in this module instead", DeprecationWarning)

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed events from parsing a Unigene file to a consumer.
        handle is a file-like object, and consumer is a consumer object
        that will receive events as the file is scanned.

        consumer.start_record() is called first; then, for each line up to
        the first '//' line, the consumer attribute named after the line's
        tag is looked up and, if callable, called with the right-stripped
        line.  consumer.end_record() is called when the '//' line is
        reached.
        """
        consumer.start_record()
        for line in handle:
            # The tag is everything up to the first space.
            tag = line.split(' ')[0]
            line = line.rstrip()
            if line == '//':
                consumer.end_record()
                break
            try:
                f = getattr(consumer, tag)
            except AttributeError:
                # Written so that it prints the same text whether print is
                # a statement or a function.
                print("no method called %s" % tag)
            else:
                if callable(f):
                    f(line)


class RecordParser(AbstractParser):
    """Parse one Unigene record into a UnigeneRecord (DEPRECATED).

    This class is DEPRECATED; please use the read() function in this module
    instead.
    """

    def __init__(self):
        warnings.warn("Bio.UniGene.RecordParser is deprecated; please use the read() function in this module instead", DeprecationWarning)
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Feed handle through the scanner and return the consumer's record.

        handle is wrapped in a File.UndoHandle unless it already is one.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        self._scanner.feed(uhandle, self._consumer)
        return self._consumer.unigene_record


class Iterator:
    """Iterate over the records of a Unigene file (DEPRECATED).

    This class is DEPRECATED; please use the parse() function in this
    module instead.
    """

    def __init__(self, handle, parser=None):
        """Wrap handle for record-by-record reading.

        The parser argument is accepted for backward compatibility but is
        not used: next() always parses with a new RecordParser.
        """
        warnings.warn("Bio.UniGene.Iterator is deprecated; please use the parse() function in this module instead", DeprecationWarning)
        self._uhandle = File.UndoHandle(handle)

    def next(self):
        """Return the next record as a UnigeneRecord.

        Returns None if no line is found before the next '//' line or the
        end of the handle; __iter__ uses that None as its stop value.
        """
        # A new parser for every record: RecordParser accumulates into the
        # single UnigeneRecord held by its consumer.
        self._parser = RecordParser()
        lines = []
        while True:
            line = self._uhandle.readline()
            if not line:
                break
            if line[:2] == '//':
                break
            lines.append(line)
        if not lines:
            return None
        # Re-append the terminator so the scanner sees a complete record.
        lines.append('//')
        data = ''.join(lines)
        return self._parser.parse(File.StringHandle(data))

    def __iter__(self):
        return iter(self.next, None)