# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# Revisions Copyright 2007 by Peter Cock.  All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""
This module provides code to work with the prosite dat file from
Prosite.
http://www.expasy.ch/prosite/

Tested with:
Release 20.43, 10-Feb-2009


Functions:
read                  Reads a Prosite file containing one Prosite record
parse                 Iterates over records in a Prosite file.

Classes:
Record                Holds Prosite data.
"""

import re


def parse(handle):
    """Parse Prosite records.

    This function is for parsing Prosite files containing multiple
    records.  It is a generator yielding one Record per entry.

    handle   - handle to the file.

    Raises ValueError if a line cannot be understood, or if the file
    ends in the middle of a record.
    """
    while True:
        record = __read(handle)
        if not record:
            break
        yield record


def read(handle):
    """Read one Prosite record.

    This function is for parsing Prosite files containing
    exactly one record.

    handle   - handle to the file.

    Returns the Record, or None if the handle contains no record at all.
    Raises ValueError if any data follows the first record, if a line
    cannot be understood, or if the file ends in the middle of a record.
    """
    record = __read(handle)
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one Prosite record found")
    return record


class Record:
    """Holds information from a Prosite record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines). (strings)
    postprocessing List of the ';'-separated fields of the PP lines.
                   (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments. (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type
    cc_scaling_db
    cc_author
    cc_ft_key
    cc_ft_desc
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """
    def __init__(self):
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These are only present in some entries; give them defaults so
        # that every documented member can always be read.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below are private functions

# "hits(seqs)" pairs as found in NR lines, e.g. "5(4)".
_NR_HITS = re.compile(r'(\d+)\((\d+)\)')

# NR qualifiers holding a "hits(seqs)" pair -> Record attribute.
_NR_PAIR_ATTRIBUTES = {
    '/TOTAL': 'nr_total',
    '/POSITIVE': 'nr_positive',
    '/UNKNOWN': 'nr_unknown',
    '/FALSE_POS': 'nr_false_pos',
}

# CC qualifiers stored verbatim as strings -> Record attribute.
_CC_ATTRIBUTES = {
    '/TAXO-RANGE': 'cc_taxo_range',
    '/MAX-REPEAT': 'cc_max_repeat',
    '/SKIP-FLAG': 'cc_skip_flag',
    '/MATRIX_TYPE': 'cc_matrix_type',
    '/SCALING_DB': 'cc_scaling_db',
    '/AUTHOR': 'cc_author',
    '/FT_KEY': 'cc_ft_key',
    '/FT_DESC': 'cc_ft_desc',
    '/VERSION': 'cc_version',
}

# DR type flag -> Record attribute (list of (accession, name) tuples).
_DR_ATTRIBUTES = {
    'T': 'dr_positive',
    'F': 'dr_false_pos',
    'N': 'dr_false_neg',
    'P': 'dr_potential',
    '?': 'dr_unknown',
}

# Tags expected, in order, on the three fields of a DT line.
_DT_TAGS = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')


def _parse_dates(record, value, line):
    """Store the three dates of a DT line on the record."""
    dates = value.rstrip('.').split("; ")
    if len(dates) < 3:
        raise ValueError("I don't understand date line\n%s" % line)
    stripped = []
    for field, tag in zip(dates, _DT_TAGS):
        if not field.endswith(tag):
            raise ValueError("I don't understand date line\n%s" % line)
        # Remove the tag as a suffix.  str.rstrip(tag) would instead strip
        # any trailing run of the tag's *characters*, eating into dates
        # that happen to end in one of those letters.
        stripped.append(field[:-len(tag)].rstrip())
    record.created, record.data_update, record.info_update = stripped


def _parse_numerical_results(record, value, line):
    """Store the qualifiers of an NR line on the record."""
    for col in value.split(";"):
        if not col:
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/RELEASE':
            release, seqs = data.split(",")
            record.nr_sp_release = release
            record.nr_sp_seqs = int(seqs)
        elif qual == '/FALSE_NEG':
            record.nr_false_neg = int(data)
        elif qual == '/PARTIAL':
            record.nr_partial = int(data)
        elif qual in _NR_PAIR_ATTRIBUTES:
            m = _NR_HITS.match(data)
            if not m:
                raise ValueError("Broken data %s in comment line\n%s"
                                 % (repr(data), line))
            hits = tuple(map(int, m.groups()))
            setattr(record, _NR_PAIR_ATTRIBUTES[qual], hits)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_comments(record, value, line):
    """Store the qualifiers of a CC line on the record.

    Fields without a qualifier are skipped without touching the record,
    which is what lets the CC lines of the copyright statement (seen
    before any record has been started) pass through harmlessly.
    """
    # Expect CC lines like this:
    # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
    # Can (normally) split on ";" and then on "="
    for col in value.split(";"):
        if not col or col.startswith('Automatic scaling'):
            # DNAJ_2 in Release 15 has a non-standard comment line:
            # CC   Automatic scaling using reversed database
            # Throw it away.  (Should I keep it?)
            continue
        if "=" not in col:
            # Missing qualifier!  Can we recover gracefully?
            # For example, from Bug 2403, in PS50293 have:
            # CC /AUTHOR=K_Hofmann; N_Hulo
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/SITE':
            pos, desc = data.split(",")
            record.cc_site.append((int(pos), desc))
        elif qual in _CC_ATTRIBUTES:
            setattr(record, _CC_ATTRIBUTES[qual], data)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_references(record, value):
    """Store the Swiss-Prot references of a DR line on the record."""
    for ref in value.split(";"):
        if not ref:
            continue
        acc, name, flag = [word.strip() for word in ref.split(",")]
        if flag not in _DR_ATTRIBUTES:
            raise ValueError("I don't understand type flag %s" % flag)
        getattr(record, _DR_ATTRIBUTES[flag]).append((acc, name))


def __read(handle):
    """Read the next Prosite record from handle.

    Lines are consumed up to and including the '//' terminating the
    record.  A '//' seen before any ID line is taken to be the end of
    the copyright statement and is skipped.

    Returns the Record, or None if the handle is exhausted without a
    record having been started.  Raises ValueError if a line cannot be
    understood or if the handle ends in the middle of a record.
    """
    record = None
    for line in handle:
        # Two-letter line code, three spaces, then the data.
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            _parse_dates(record, value, line)
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            _parse_numerical_results(record, value, line)
        elif keyword == 'CC':
            _parse_comments(record, value, line)
        elif keyword == 'DR':
            _parse_references(record, value)
        elif keyword == '3D':
            record.pdb_structs.extend(
                entry.rstrip(';') for entry in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if not record:
                # Then this was the copyright statement
                continue
            return record
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    if record is not None:
        # The stream ended before the '//' closing the record; report it
        # rather than silently discarding the partial record.
        raise ValueError("Unexpected end of stream.")
    return None