# Copyright 1999 by Jeffrey Chang. All rights reserved. # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """ This module provides code to work with the sprotXX.dat file from Utilities for working with FASTA-formatted sequences (DEPRECATED). http://www.expasy.ch/sprot/sprot-top.html Please see Bio.SwissProt for alternatives for the functionality in this module. Tested with: Release 37, Release 38, Release 39 Limited testing with: Release 51, 54 Classes: Record Holds SwissProt data. Reference Holds reference data from a SwissProt entry. Iterator Iterates over entries in a SwissProt file. Dictionary Accesses a SwissProt file using a dictionary interface. RecordParser Parses a SwissProt record into a Record object. SequenceParser Parses a SwissProt record into a SeqRecord object. _Scanner Scans SwissProt-formatted data. _RecordConsumer Consumes SwissProt data to a SProt.Record object. _SequenceConsumer Consumes SwissProt data to a SeqRecord object. Functions: index_file Index a SwissProt file for a Dictionary. """ import warnings warnings.warn("Bio.SwissProt.SProt is deprecated. Please use the functions Bio.SwissProt.parse or Bio.SwissProt.read if you want to get a SwissProt.Record, or Bio.SeqIO.parse or Bio.SeqIO.read if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).", DeprecationWarning) from types import * import os from Bio import File from Bio import Index from Bio import Alphabet from Bio import Seq from Bio import SeqRecord from Bio.ParserSupport import * # The parse(), read() functions can probably be simplified if we don't # use the "parser = RecordParser(); parser.parse(handle)" approach. def parse(handle): from SProt import RecordParser import cStringIO parser = RecordParser() text = "" for line in handle: text += line if line[:2]=='//': handle = cStringIO.StringIO(text) record = parser.parse(handle) text = "" yield record def read(handle): from SProt import RecordParser parser = RecordParser() try: record = parser.parse(handle) except ValueError, error: if error.message.startswith("Line does not start with 'ID':"): raise ValueError("No SwissProt record found") else: raise error # We should have reached the end of the record by now remainder = handle.read() if remainder: raise ValueError("More than one SwissProt record found") return record _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation class Record: """Holds information from a SwissProt record. Members: entry_name Name of this entry, e.g. RL1_ECOLI. data_class Either 'STANDARD' or 'PRELIMINARY'. molecule_type Type of molecule, 'PRT', sequence_length Number of residues. accessions List of the accession numbers, e.g. ['P00321'] created A tuple of (date, release). sequence_update A tuple of (date, release). annotation_update A tuple of (date, release). description Free-format description. gene_name Gene name. See userman.txt for description. organism The source of the sequence. organelle The origin of the sequence. organism_classification The taxonomy classification. List of strings. (http://www.ncbi.nlm.nih.gov/Taxonomy/) taxonomy_id A list of NCBI taxonomy id's. host_organism A list of NCBI taxonomy id's of the hosts of a virus, if any. references List of Reference objects. comments List of strings. cross_references List of tuples (db, id1[, id2][, id3]). See the docs. keywords List of the keywords. features List of tuples (key name, from, to, description). from and to can be either integers for the residue numbers, '<', '>', or '?' seqinfo tuple of (length, molecular weight, CRC32 value) sequence The sequence. """ def __init__(self): self.entry_name = None self.data_class = None self.molecule_type = None self.sequence_length = None self.accessions = [] self.created = None self.sequence_update = None self.annotation_update = None self.description = '' self.gene_name = '' self.organism = '' self.organelle = '' self.organism_classification = [] self.taxonomy_id = [] self.host_organism = [] self.references = [] self.comments = [] self.cross_references = [] self.keywords = [] self.features = [] self.seqinfo = None self.sequence = '' class Reference: """Holds information from 1 references in a SwissProt entry. Members: number Number of reference in an entry. positions Describes extent of work. list of strings. comments Comments. List of (token, text). references References. List of (dbname, identifier) authors The authors of the work. title Title of the work. location A citation for the work. """ def __init__(self): self.number = None self.positions = [] self.comments = [] self.references = [] self.authors = '' self.title = '' self.location = '' class Dictionary: """Accesses a SwissProt file using a dictionary interface. """ __filename_key = '__filename' def __init__(self, indexname, parser=None): """__init__(self, indexname, parser=None) Open a SwissProt Dictionary. indexname is the name of the index for the dictionary. The index should have been created using the index_file function. parser is an optional Parser object to change the results into another form. If set to None, then the raw contents of the file will be returned. """ self._index = Index.Index(indexname) self._handle = open(self._index[self.__filename_key]) self._parser = parser def __len__(self): return len(self._index) def __getitem__(self, key): start, len = self._index[key] self._handle.seek(start) data = self._handle.read(len) if self._parser is not None: return self._parser.parse(File.StringHandle(data)) return data def __getattr__(self, name): return getattr(self._index, name) def keys(self): # I only want to expose the keys for SwissProt. k = self._index.keys() k.remove(self.__filename_key) return k class RecordParser(AbstractParser): """Parses SwissProt data into a Record object. """ def __init__(self): self._scanner = _Scanner() self._consumer = _RecordConsumer() def parse(self, handle): self._scanner.feed(handle, self._consumer) return self._consumer.data class SequenceParser(AbstractParser): """Parses SwissProt data into a standard SeqRecord object. """ def __init__(self, alphabet = Alphabet.generic_protein): """Initialize a SequenceParser. Arguments: o alphabet - The alphabet to use for the generated Seq objects. If not supplied this will default to the generic protein alphabet. """ self._scanner = _Scanner() self._consumer = _SequenceConsumer(alphabet) def parse(self, handle): self._scanner.feed(handle, self._consumer) return self._consumer.data class _Scanner: """Scans SwissProt-formatted data. Tested with: Release 37 Release 38 """ def feed(self, handle, consumer): """feed(self, handle, consumer) Feed in SwissProt data for scanning. handle is a file-like object that contains swissprot data. consumer is a Consumer object that will receive events as the report is scanned. """ if isinstance(handle, File.UndoHandle): uhandle = handle else: uhandle = File.UndoHandle(handle) self._scan_record(uhandle, consumer) def _skip_starstar(self, uhandle): """Ignores any lines starting **""" #See Bug 2353, some files from the EBI have extra lines #starting "**" (two asterisks/stars), usually between the #features and sequence but not all the time. They appear #to be unofficial automated annotations. e.g. #** #** ################# INTERNAL SECTION ################## #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. while "**" == uhandle.peekline()[:2]: skip = uhandle.readline() #print "Skipping line: %s" % skip.rstrip() def _scan_record(self, uhandle, consumer): consumer.start_record() for fn in self._scan_fns: self._skip_starstar(uhandle) fn(self, uhandle, consumer) # In Release 38, ID N33_HUMAN has a DR buried within comments. # Check for this and do more comments, if necessary. # XXX handle this better if fn is self._scan_dr.im_func: self._scan_cc(uhandle, consumer) self._scan_dr(uhandle, consumer) consumer.end_record() def _scan_line(self, line_type, uhandle, event_fn, exactly_one=None, one_or_more=None, any_number=None, up_to_one=None): # Callers must set exactly one of exactly_one, one_or_more, or # any_number to a true value. I do not explicitly check to # make sure this function is called correctly. # This does not guarantee any parameter safety, but I # like the readability. The other strategy I tried was have # parameters min_lines, max_lines. if exactly_one or one_or_more: read_and_call(uhandle, event_fn, start=line_type) if one_or_more or any_number: while 1: if not attempt_read_and_call(uhandle, event_fn, start=line_type): break if up_to_one: attempt_read_and_call(uhandle, event_fn, start=line_type) def _scan_id(self, uhandle, consumer): self._scan_line('ID', uhandle, consumer.identification, exactly_one=1) def _scan_ac(self, uhandle, consumer): # Until release 38, this used to match exactly_one. # However, in release 39, 1A02_HUMAN has 2 AC lines, and the # definition needed to be expanded. self._scan_line('AC', uhandle, consumer.accession, any_number=1) def _scan_dt(self, uhandle, consumer): self._scan_line('DT', uhandle, consumer.date, exactly_one=1) self._scan_line('DT', uhandle, consumer.date, exactly_one=1) # IPI doesn't necessarily contain the third line about annotations self._scan_line('DT', uhandle, consumer.date, up_to_one=1) def _scan_de(self, uhandle, consumer): # IPI can be missing a DE line self._scan_line('DE', uhandle, consumer.description, any_number=1) def _scan_gn(self, uhandle, consumer): self._scan_line('GN', uhandle, consumer.gene_name, any_number=1) def _scan_os(self, uhandle, consumer): self._scan_line('OS', uhandle, consumer.organism_species, one_or_more=1) def _scan_og(self, uhandle, consumer): self._scan_line('OG', uhandle, consumer.organelle, any_number=1) def _scan_oc(self, uhandle, consumer): self._scan_line('OC', uhandle, consumer.organism_classification, one_or_more=1) def _scan_ox(self, uhandle, consumer): self._scan_line('OX', uhandle, consumer.taxonomy_id, any_number=1) def _scan_oh(self, uhandle, consumer): # viral host organism. introduced after SwissProt 39. self._scan_line('OH', uhandle, consumer.organism_host, any_number=1) def _scan_reference(self, uhandle, consumer): while True: if safe_peekline(uhandle)[:2] != 'RN': break self._scan_rn(uhandle, consumer) self._scan_rp(uhandle, consumer) self._scan_rc(uhandle, consumer) self._scan_rx(uhandle, consumer) # ws:2001-12-05 added, for record with RL before RA self._scan_rl(uhandle, consumer) self._scan_ra(uhandle, consumer) #EBI copy of P72010 is missing the RT line, and has one #of their ** lines in its place noting "** /NO TITLE." #See also bug 2353 self._skip_starstar(uhandle) self._scan_rt(uhandle, consumer) self._scan_rl(uhandle, consumer) def _scan_rn(self, uhandle, consumer): self._scan_line('RN', uhandle, consumer.reference_number, exactly_one=1) def _scan_rp(self, uhandle, consumer): self._scan_line('RP', uhandle, consumer.reference_position, one_or_more=1) def _scan_rc(self, uhandle, consumer): self._scan_line('RC', uhandle, consumer.reference_comment, any_number=1) def _scan_rx(self, uhandle, consumer): self._scan_line('RX', uhandle, consumer.reference_cross_reference, any_number=1) def _scan_ra(self, uhandle, consumer): # In UniProt release 1.12 of 6/21/04, there is a new RG # (Reference Group) line, which references a group instead of # an author. Each block must have at least 1 RA or RG line. self._scan_line('RA', uhandle, consumer.reference_author, any_number=1) self._scan_line('RG', uhandle, consumer.reference_author, any_number=1) # PRKN_HUMAN has RG lines, then RA lines. The best solution # is to write code that accepts either of the line types. # This is the quick solution... self._scan_line('RA', uhandle, consumer.reference_author, any_number=1) def _scan_rt(self, uhandle, consumer): self._scan_line('RT', uhandle, consumer.reference_title, any_number=1) def _scan_rl(self, uhandle, consumer): # This was one_or_more, but P82909 in TrEMBL 16.0 does not # have one. self._scan_line('RL', uhandle, consumer.reference_location, any_number=1) def _scan_cc(self, uhandle, consumer): self._scan_line('CC', uhandle, consumer.comment, any_number=1) def _scan_dr(self, uhandle, consumer): self._scan_line('DR', uhandle, consumer.database_cross_reference, any_number=1) def _scan_kw(self, uhandle, consumer): self._scan_line('KW', uhandle, consumer.keyword, any_number=1) def _scan_ft(self, uhandle, consumer): self._scan_line('FT', uhandle, consumer.feature_table, any_number=1) def _scan_pe(self, uhandle, consumer): self._scan_line('PE', uhandle, consumer.protein_existence, any_number=1) def _scan_sq(self, uhandle, consumer): self._scan_line('SQ', uhandle, consumer.sequence_header, exactly_one=1) def _scan_sequence_data(self, uhandle, consumer): self._scan_line(' ', uhandle, consumer.sequence_data, one_or_more=1) def _scan_terminator(self, uhandle, consumer): self._scan_line('//', uhandle, consumer.terminator, exactly_one=1) _scan_fns = [ _scan_id, _scan_ac, _scan_dt, _scan_de, _scan_gn, _scan_os, _scan_og, _scan_oc, _scan_ox, _scan_oh, _scan_reference, _scan_cc, _scan_dr, _scan_pe, _scan_kw, _scan_ft, _scan_sq, _scan_sequence_data, _scan_terminator ] class _RecordConsumer(AbstractConsumer): """Consumer that converts a SwissProt record to a Record object. Members: data Record with SwissProt data. """ def __init__(self): self.data = None def __repr__(self): return "Bio.SwissProt.SProt._RecordConsumer()" def start_record(self): self.data = Record() self._sequence_lines = [] def end_record(self): self._clean_record(self.data) self.data.sequence = "".join(self._sequence_lines) def identification(self, line): cols = line.split() #Prior to release 51, included with MoleculeType: #ID EntryName DataClass; MoleculeType; SequenceLength. # #Newer files lack the MoleculeType: #ID EntryName DataClass; SequenceLength. # #Note that cols is split on white space, so the length #should become two fields (number and units) if len(cols) == 6: self.data.entry_name = cols[1] self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' self.data.sequence_length = int(cols[4]) elif len(cols) == 5: self.data.entry_name = cols[1] self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' self.data.molecule_type = None self.data.sequence_length = int(cols[3]) else: #Should we print a warning an continue? raise ValueError("ID line has unrecognised format:\n"+line) # data class can be 'STANDARD' or 'PRELIMINARY' # ws:2001-12-05 added IPI # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed']: raise ValueError("Unrecognized data class %s in line\n%s" % \ (self.data.data_class, line)) # molecule_type should be 'PRT' for PRoTein # Note that has been removed in recent releases (set to None) if self.data.molecule_type is not None \ and self.data.molecule_type != 'PRT': raise ValueError("Unrecognized molecule type %s in line\n%s" % \ (self.data.molecule_type, line)) def accession(self, line): cols = line[5:].rstrip(_CHOMP).strip().split(';') for ac in cols: if ac.strip(): #remove any leading or trailing white space self.data.accessions.append(ac.strip()) def date(self, line): uprline = line.upper() cols = line.rstrip().split() if uprline.find('CREATED') >= 0 \ or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ or uprline.find('LAST ANNOTATION UPDATE') >= 0: # Old style DT line # ================= # e.g. # DT 01-FEB-1995 (Rel. 31, Created) # DT 01-FEB-1995 (Rel. 31, Last sequence update) # DT 01-OCT-2000 (Rel. 40, Last annotation update) # # or: # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) # ... # find where the version information will be located # This is needed for when you have cases like IPI where # the release verison is in a different spot: # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) uprcols = uprline.split() rel_index = -1 for index in range(len(uprcols)): if uprcols[index].find("REL.") >= 0: rel_index = index assert rel_index >= 0, \ "Could not find Rel. in DT line: %s" % (line) version_index = rel_index + 1 # get the version information str_version = cols[version_index].rstrip(_CHOMP) # no version number if str_version == '': version = 0 # dot versioned elif str_version.find(".") >= 0: version = str_version # integer versioned else: version = int(str_version) if uprline.find('CREATED') >= 0: self.data.created = cols[1], version elif uprline.find('LAST SEQUENCE UPDATE') >= 0: self.data.sequence_update = cols[1], version elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: self.data.annotation_update = cols[1], version else: assert False, "Shouldn't reach this line!" elif uprline.find('INTEGRATED INTO') >= 0 \ or uprline.find('SEQUENCE VERSION') >= 0 \ or uprline.find('ENTRY VERSION') >= 0: # New style DT line # ================= # As of UniProt Knowledgebase release 7.0 (including # Swiss-Prot release 49.0 and TrEMBL release 32.0) the # format of the DT lines and the version information # in them was changed - the release number was dropped. # # For more information see bug 1948 and # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 # # e.g. # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. # DT 15-OCT-2001, sequence version 3. # DT 01-APR-2004, entry version 14. # #This is a new style DT line... # The date should be in string cols[1] # Get the version number if there is one. # For the three DT lines above: 0, 3, 14 try: version = int(cols[-1]) except ValueError: version = 0 # Re-use the historical property names, even though # the meaning has changed slighty: if uprline.find("INTEGRATED") >= 0: self.data.created = cols[1], version elif uprline.find('SEQUENCE VERSION') >= 0: self.data.sequence_update = cols[1], version elif uprline.find( 'ENTRY VERSION') >= 0: self.data.annotation_update = cols[1], version else: assert False, "Shouldn't reach this line!" else: raise ValueError("I don't understand the date line %s" % line) def description(self, line): self.data.description += line[5:].strip() + " " def gene_name(self, line): self.data.gene_name += line[5:] def organism_species(self, line): self.data.organism += line[5:] def organelle(self, line): self.data.organelle += line[5:] def organism_classification(self, line): line = line[5:].rstrip(_CHOMP) cols = line.split(';') for col in cols: self.data.organism_classification.append(col.lstrip()) def taxonomy_id(self, line): # The OX line is in the format: # OX DESCRIPTION=ID[, ID]...; # If there are too many id's to fit onto a line, then the ID's # continue directly onto the next line, e.g. # OX DESCRIPTION=ID[, ID]... # OX ID[, ID]...; # Currently, the description is always "NCBI_TaxID". # To parse this, I need to check to see whether I'm at the # first line. If I am, grab the description and make sure # it's an NCBI ID. Then, grab all the id's. line = line[5:].rstrip(_CHOMP) index = line.find('=') if index >= 0: descr = line[:index] assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr ids = line[index+1:].split(',') else: ids = line.split(',') self.data.taxonomy_id.extend([id.strip() for id in ids]) def organism_host(self, line): # Line type OH (Organism Host) for viral hosts # same code as in taxonomy_id() line = line[5:].rstrip(_CHOMP) index = line.find('=') if index >= 0: descr = line[:index] assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr ids = line[index+1:].split(',') else: ids = line.split(',') self.data.host_organism.extend([id.strip() for id in ids]) def reference_number(self, line): rn = line[5:].rstrip() assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn ref = Reference() ref.number = int(rn[1:-1]) self.data.references.append(ref) def reference_position(self, line): assert self.data.references, "RP: missing RN" self.data.references[-1].positions.append(line[5:].rstrip()) def reference_comment(self, line): assert self.data.references, "RC: missing RN" cols = line[5:].rstrip().split( ';') ref = self.data.references[-1] for col in cols: if not col: # last column will be the empty string continue # The token is everything before the first '=' character. index = col.find('=') token, text = col[:index], col[index+1:] # According to the spec, there should only be 1 '=' # character. However, there are too many exceptions to # handle, so we'll ease up and allow anything after the # first '='. #if col == ' STRAIN=TISSUE=BRAIN': # # from CSP_MOUSE, release 38 # token, text = "TISSUE", "BRAIN" #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': # # from NDOA_PSEPU, release 38 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': # # from NU3M_BALPH, release 38, release 39 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" #else: # token, text = string.split(col, '=') ref.comments.append((token.lstrip(), text)) def reference_cross_reference(self, line): assert self.data.references, "RX: missing RN" # The basic (older?) RX line is of the form: # RX MEDLINE; 85132727. # but there are variants of this that need to be dealt with (see below) # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 # have extraneous information in the RX line. Check for # this and chop it out of the line. # (noticed by katel@worldpath.net) ind = line.find('[NCBI, ExPASy, Israel, Japan]') if ind >= 0: line = line[:ind] # RX lines can also be used of the form # RX PubMed=9603189; # reported by edvard@farmasi.uit.no # and these can be more complicated like: # RX MEDLINE=95385798; PubMed=7656980; # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; # We look for these cases first and deal with them if line.find("=") != -1: cols = line[2:].split("; ") cols = [x.strip() for x in cols] cols = [x for x in cols if x] for col in cols: x = col.split("=") assert len(x) == 2, "I don't understand RX line %s" % line key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) ref = self.data.references[-1].references ref.append((key, value)) # otherwise we assume we have the type 'RX MEDLINE; 85132727.' else: cols = line.split() # normally we split into the three parts assert len(cols) == 3, "I don't understand RX line %s" % line self.data.references[-1].references.append( (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP))) def reference_author(self, line): assert self.data.references, "RA: missing RN" ref = self.data.references[-1] ref.authors += line[5:] def reference_title(self, line): assert self.data.references, "RT: missing RN" ref = self.data.references[-1] ref.title += line[5:] def reference_location(self, line): assert self.data.references, "RL: missing RN" ref = self.data.references[-1] ref.location += line[5:] def comment(self, line): if line[5:8] == '-!-': # Make a new comment self.data.comments.append(line[9:]) elif line[5:8] == ' ': # add to the previous comment if not self.data.comments: # TCMO_STRGA in Release 37 has comment with no topic self.data.comments.append(line[9:]) else: self.data.comments[-1] += line[9:] elif line[5:8] == '---': # If there are no comments, and it's not the closing line, # make a new comment. if not self.data.comments or self.data.comments[-1][:3] != '---': self.data.comments.append(line[5:]) else: self.data.comments[-1] += line[5:] else: # copyright notice self.data.comments[-1] += line[5:] def database_cross_reference(self, line): # From CLD1_HUMAN, Release 39: # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. line = line[5:] # Remove the comments at the end of the line i = line.find('[') if i >= 0: line = line[:i] cols = line.rstrip(_CHOMP).split(';') cols = [col.lstrip() for col in cols] self.data.cross_references.append(tuple(cols)) def keyword(self, line): cols = line[5:].rstrip(_CHOMP).split(';') self.data.keywords.extend([c.lstrip() for c in cols]) def feature_table(self, line): line = line[5:] # get rid of junk in front name = line[0:8].rstrip() try: from_res = int(line[9:15]) except ValueError: from_res = line[9:15].lstrip() try: to_res = int(line[16:22]) except ValueError: to_res = line[16:22].lstrip() description = line[29:70].rstrip() #if there is a feature_id (FTId), store it away if line[29:35]==r"/FTId=": ft_id = line[35:70].rstrip()[:-1] else: ft_id ="" if not name: # is continuation of last one assert not from_res and not to_res name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] del self.data.features[-1] description = "%s %s" % (old_description, description) # special case -- VARSPLIC, reported by edvard@farmasi.uit.no if name == "VARSPLIC": description = self._fix_varsplic_sequences(description) self.data.features.append((name, from_res, to_res, description,ft_id)) def _fix_varsplic_sequences(self, description): """Remove unwanted spaces in sequences. During line carryover, the sequences in VARSPLIC can get mangled with unwanted spaces like: 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' We want to check for this case and correct it as it happens. """ descr_cols = description.split(" -> ") if len(descr_cols) == 2: first_seq = descr_cols[0] second_seq = descr_cols[1] extra_info = '' # we might have more information at the end of the # second sequence, which should be in parenthesis extra_info_pos = second_seq.find(" (") if extra_info_pos != -1: extra_info = second_seq[extra_info_pos:] second_seq = second_seq[:extra_info_pos] # now clean spaces out of the first and second string first_seq = first_seq.replace(" ", "") second_seq = second_seq.replace(" ", "") # reassemble the description description = first_seq + " -> " + second_seq + extra_info return description def protein_existence(self, line): #TODO - Record this information? pass def sequence_header(self, line): cols = line.split() assert len(cols) == 8, "I don't understand SQ line %s" % line # Do more checking here? self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6] def sequence_data(self, line): #It should be faster to make a list of strings, and join them at the end. self._sequence_lines.append(line.replace(" ", "").rstrip()) def terminator(self, line): pass #def _clean(self, line, rstrip=1): # if rstrip: # return string.rstrip(line[5:]) # return line[5:] def _clean_record(self, rec): # Remove trailing newlines members = ['description', 'gene_name', 'organism', 'organelle'] for m in members: attr = getattr(rec, m) setattr(rec, m, attr.rstrip()) for ref in rec.references: self._clean_references(ref) def _clean_references(self, ref): # Remove trailing newlines members = ['authors', 'title', 'location'] for m in members: attr = getattr(ref, m) setattr(ref, m, attr.rstrip()) class _SequenceConsumer(AbstractConsumer): """Consumer that converts a SwissProt record to a SeqRecord object. Members: data Record with SwissProt data. alphabet The alphabet the generated Seq objects will have. """ #TODO - Cope with references as done for GenBank def __init__(self, alphabet = Alphabet.generic_protein): """Initialize a Sequence Consumer Arguments: o alphabet - The alphabet to use for the generated Seq objects. If not supplied this will default to the generic protein alphabet. """ self.data = None self.alphabet = alphabet def start_record(self): seq = Seq.Seq("", self.alphabet) self.data = SeqRecord.SeqRecord(seq) self.data.description = "" self.data.name = "" self._current_ref = None self._sequence_lines = [] def end_record(self): if self._current_ref is not None: self.data.annotations['references'].append(self._current_ref) self._current_ref = None self.data.description = self.data.description.rstrip() self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet) self.data.annotations['organism'] = self.data.annotations['organism'].rstrip(_CHOMP) def identification(self, line): cols = line.split() self.data.name = cols[1] def accession(self, line): #Note that files can and often do contain multiple AC lines. ids = line[5:].strip().split(';') #Remove any white space ids = [x.strip() for x in ids if x.strip()] #Use the first as the ID, but record them ALL in the annotations try: self.data.annotations['accessions'].extend(ids) except KeyError: self.data.annotations['accessions'] = ids #Use the FIRST accession as the ID, not the first on this line! self.data.id = self.data.annotations['accessions'][0] #self.data.id = ids[0] def description(self, line): self.data.description += line[5:].strip() + " " def sequence_data(self, line): #It should be faster to make a list of strings, and join them at the end. self._sequence_lines.append(line.replace(" ", "").rstrip()) def gene_name(self, line): #We already store the identification/accession as the records name/id try: self.data.annotations['gene_name'] += " " + line[5:].rstrip() except KeyError: self.data.annotations['gene_name'] = line[5:].rstrip() def comment(self, line): #Try and agree with SeqRecord convention from the GenBank parser, #which stores the comments as a long string with newlines #with key 'comment' #TODO - Follow SwissProt conventions more closely? prefix = line[5:8] text = line[9:].rstrip() if prefix == '-!-': # Make a new comment try: self.data.annotations['comment'] += "\n" + text except KeyError: self.data.annotations['comment'] = text elif prefix == ' ': try: # add to the previous comment self.data.annotations['comment'] += " " + text except KeyError: # TCMO_STRGA in Release 37 has comment with no topic self.data.annotations['comment'] = text def database_cross_reference(self, line): #Format of the line is described in the manual dated 04-Dec-2007 as: #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY]. #However, some older files only seem to have a single identifier: #DR DATABASE; PRIMARY. # #Also must cope with things like this from Tests/SwissProt/sp007, #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain] # #Store these in the dbxref list, but for consistency with #the GenBank parser and with what BioSQL can cope with, #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")] if len(parts) > 1: value = "%s:%s" % (parts[0], parts[1]) #Avoid duplicate entries if value not in self.data.dbxrefs: self.data.dbxrefs.append(value) #else: #print "Bad DR line:\n%s" % line def date(self, line): date_str = line.split()[1] uprline = line.upper() if uprline.find('CREATED') >= 0: #Try and agree with SeqRecord convention from the GenBank parser, #which stores the submitted date as 'date' self.data.annotations['date'] = date_str elif uprline.find('LAST SEQUENCE UPDATE') >= 0: #There is no existing convention from the GenBank SeqRecord parser self.data.annotations['date_last_sequence_update'] = date_str elif uprline.find('LAST ANNOTATION UPDATE') >= 0: #There is no existing convention from the GenBank SeqRecord parser self.data.annotations['date_last_annotation_update'] = date_str elif uprline.find('INTEGRATED INTO') >= 0: self.data.annotations['date'] = date_str.rstrip(",") elif uprline.find('SEQUENCE VERSION') >= 0: self.data.annotations['date_last_sequence_update'] = date_str.rstrip(",") elif uprline.find('ENTRY VERSION') >= 0: self.data.annotations['date_last_annotation_update'] = date_str.rstrip(",") def keyword(self, line): #Try and agree with SeqRecord convention from the GenBank parser, #which stores a list as 'keywords' cols = line[5:].rstrip(_CHOMP).split(';') cols = [c.strip() for c in cols] cols = filter(None, cols) try: #Extend any existing list of keywords self.data.annotations['keywords'].extend(cols) except KeyError: #Create the list of keywords self.data.annotations['keywords'] = cols def organism_species(self, line): #Try and agree with SeqRecord convention from the GenBank parser, #which stores the organism as a string with key 'organism' data = line[5:].rstrip() try: #Append to any existing data split over multiple lines self.data.annotations['organism'] += " " + data except KeyError: self.data.annotations['organism'] = data def organism_host(self, line): #There is no SeqRecord convention from the GenBank parser, data = line[5:].rstrip(_CHOMP) index = data.find('=') if index >= 0: descr = data[:index] assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr ids = data[index+1:].split(',') else: ids = data.split(',') try: #Append to any existing data self.data.annotations['organism_host'].extend(ids) except KeyError: self.data.annotations['organism_host'] = ids def organism_classification(self, line): #Try and agree with SeqRecord convention from the GenBank parser, #which stores this taxonomy lineage ese as a list of strings with #key 'taxonomy'. #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX) line = line[5:].rstrip(_CHOMP) cols = [col.strip() for col in line.split(';')] try: #Append to any existing data self.data.annotations['taxonomy'].extend(cols) except KeyError: self.data.annotations['taxonomy'] = cols def taxonomy_id(self, line): #Try and agree with SeqRecord convention expected in BioSQL #the NCBI taxon id with key 'ncbi_taxid'. #Note that 'taxonomy' is used for the taxonomy lineage #(held as a list of strings, line type OC) line = line[5:].rstrip(_CHOMP) index = line.find('=') if index >= 0: descr = line[:index] assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr ids = line[index+1:].split(',') else: ids = line.split(',') try: #Append to any existing data self.data.annotations['ncbi_taxid'].extend(ids) except KeyError: self.data.annotations['ncbi_taxid'] = ids def reference_number(self, line): """RN line, reference number (start of new reference).""" from Bio.SeqFeature import Reference # if we have a current reference that hasn't been added to # the list of references, add it. if self._current_ref is not None: self.data.annotations['references'].append(self._current_ref) else: self.data.annotations['references'] = [] self._current_ref = Reference() def reference_position(self, line): """RP line, reference position.""" assert self._current_ref is not None, "RP: missing RN" #Should try and store this in self._current_ref.location #but the SwissProt locations don't match easily to the #format used in GenBank... pass def reference_cross_reference(self, line): """RX line, reference cross-references.""" assert self._current_ref is not None, "RX: missing RN" # The basic (older?) RX line is of the form: # RX MEDLINE; 85132727. # or more recently: # RX MEDLINE=95385798; PubMed=7656980; # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; # We look for these cases first and deal with them if line.find("=") != -1: cols = line[2:].split("; ") cols = [x.strip() for x in cols] cols = [x for x in cols if x] for col in cols: x = col.split("=") assert len(x) == 2, "I don't understand RX line %s" % line key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) if key == "MEDLINE": self._current_ref.medline_id = value elif key == "PubMed": self._current_ref.pubmed_id = value else: #Sadly the SeqFeature.Reference object doesn't #support anything else (yet) pass # otherwise we assume we have the type 'RX MEDLINE; 85132727.' else: # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 # have extraneous information in the RX line. Check for # this and chop it out of the line. # (noticed by katel@worldpath.net) ind = line.find('[NCBI, ExPASy, Israel, Japan]') if ind >= 0: line = line[:ind] cols = line.split() # normally we split into the three parts assert len(cols) == 3, "I don't understand RX line %s" % line key = cols[1].rstrip(_CHOMP) value = cols[2].rstrip(_CHOMP) if key == "MEDLINE": self._current_ref.medline_id = value elif key == "PubMed": self._current_ref.pubmed_id = value else: #Sadly the SeqFeature.Reference object doesn't #support anything else (yet) pass def reference_author(self, line): """RA line, reference author(s).""" assert self._current_ref is not None, "RA: missing RN" if self._current_ref.authors: self._current_ref.authors += " " self._current_ref.authors += line[5:].rstrip("\n") def reference_title(self, line): """RT line, reference title.""" assert self._current_ref is not None, "RT: missing RN" if self._current_ref.title: self._current_ref.title += " " self._current_ref.title += line[5:].rstrip("\n") def reference_location(self, line): """RL line, reference 'location' - journal, volume, pages, year.""" assert self._current_ref is not None, "RL: missing RN" if self._current_ref.journal: self._current_ref.journal += " " self._current_ref.journal += line[5:].rstrip("\n") def reference_comment(self, line): """RC line, reference comment.""" assert self._current_ref is not None, "RC: missing RN" #This has a key=value; structure... #Can we do a better job with the current Reference class? if self._current_ref.comment: self._current_ref.comment += " " self._current_ref.comment += line[5:].rstrip("\n") def index_file(filename, indexname, rec2key=None): """index_file(filename, indexname, rec2key=None) Index a SwissProt file. filename is the name of the file. indexname is the name of the dictionary. rec2key is an optional callback that takes a Record and generates a unique key (e.g. the accession number) for the record. If not specified, the entry name will be used. """ from Bio.SwissProt import parse if not os.path.exists(filename): raise ValueError("%s does not exist" % filename) index = Index.Index(indexname, truncate=1) index[Dictionary._Dictionary__filename_key] = filename handle = open(filename) records = parse(handle) end = 0L for record in records: start = end end = handle.tell() length = end - start if rec2key is not None: key = rec2key(record) else: key = record.entry_name if not key: raise KeyError("empty sequence key was produced") elif key in index: raise KeyError("duplicate key %s found" % key) index[key] = start, length