# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# Revisions Copyright 2007 by Peter Cock.  All rights reserved.
# Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Parser for the prosite dat file from Prosite at ExPASy.

See https://www.expasy.org/prosite/

Tested with:
 - Release 20.43, 10-Feb-2009
 - Release 2017_03 of 15-Mar-2017.

Functions:
 - read                  Reads a Prosite file containing one Prosite record
 - parse                 Iterates over records in a Prosite file.

Classes:
 - Record                Holds Prosite data.

"""

import re


def parse(handle):
    """Parse Prosite records.

    This function is for parsing Prosite files containing multiple
    records.  It is a generator: records are read lazily, one per
    iteration, until the handle is exhausted.

    Arguments:
     - handle   - handle to the file.

    Raises ValueError if a line cannot be understood.

    """
    while True:
        record = __read(handle)
        if record is None:
            break
        yield record


def read(handle):
    """Read one Prosite record.

    This function is for parsing Prosite files containing
    exactly one record.

    Arguments:
     - handle   - handle to the file.

    Returns the Record, or None if no complete record was found in the
    handle.  Raises ValueError if anything follows the first record or
    if a line cannot be understood.

    """
    record = __read(handle)
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one Prosite record found")
    return record


class Record(object):
    """Holds information from a Prosite record.

    Main attributes:
     - name           ID of the record.  e.g. ADH_ZINC
     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
     - accession      e.g. PS00387
     - created        Date the entry was created.  (MMM-YYYY for releases
       before January 2017, DD-MMM-YYYY since January 2017)
     - data_update    Date the 'primary' data was last updated.
     - info_update    Date data other than 'primary' data was last updated.
     - pdoc           ID of the PROSITE DOCumentation.
     - description    Free-format description.
     - pattern        The PROSITE pattern.  See docs.
     - matrix         List of strings that describes a matrix entry.
     - rules          List of rule definitions (from RU lines).  (strings)
     - prorules       List of prorules (from PR lines). (strings)
     - postprocessing List of strings taken from the PP lines, split on ";".

    NUMERICAL RESULTS:
     - nr_sp_release  SwissProt release.
     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
     - nr_positive    True positives.  tuple of (hits, seqs)
     - nr_unknown     Could be positives.  tuple of (hits, seqs)
     - nr_false_pos   False positives.  tuple of (hits, seqs)
     - nr_false_neg   False negatives.  (int)
     - nr_partial     False negatives, because they are fragments. (int)

    COMMENTS:
     - cc_taxo_range  Taxonomic range.  See docs for format
     - cc_max_repeat  Maximum number of repetitions in a protein
     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
     - cc_skip_flag   Can this entry be ignored?
     - cc_matrix_type
     - cc_scaling_db
     - cc_author
     - cc_ft_key
     - cc_ft_desc
     - cc_version     version number (introduced in release 19.0)

    The following are all lists of tuples (swiss-prot accession, swiss-prot name).

    DATA BANK REFERENCES:
     - dr_positive
     - dr_false_neg
     - dr_false_pos
     - dr_potential   Potential hits, but fingerprint region not yet available.
     - dr_unknown     Could possibly belong
     - pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Initialize the class."""
        self.name = ""
        self.type = ""
        self.accession = ""
        self.created = ""
        self.data_update = ""
        self.info_update = ""
        self.pdoc = ""

        self.description = ""
        self.pattern = ""
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ""
        self.nr_sp_seqs = ""
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ""
        self.cc_max_repeat = ""
        self.cc_site = []
        self.cc_skip_flag = ""
        # These are only assigned by the parser when the matching CC
        # qualifier is present; give them defaults so that every documented
        # attribute can always be read.
        self.cc_matrix_type = ""
        self.cc_scaling_db = ""
        self.cc_author = ""
        self.cc_ft_key = ""
        self.cc_ft_desc = ""
        self.cc_version = ""

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below are private functions

# Matches NR hit counts written as "hits(seqs)", e.g. "10(8)".
_HITS_RE = re.compile(r"(\d+)\((\d+)\)")

# Line codes that carry record data and therefore need a preceding ID line.
_RECORD_KEYWORDS = frozenset(
    ("AC", "DT", "DE", "PA", "MA", "PP", "RU", "NR", "DR", "3D", "PR", "DO")
)

# DT line fields, in order: (Record attribute, accepted endings, number of
# trailing words to drop to leave only the date).
_DATE_FIELDS = (
    ("created", (" (CREATED)", " CREATED"), 1),
    ("data_update", (" (DATA UPDATE)", " DATA UPDATE"), 2),
    ("info_update", (" (INFO UPDATE)", " INFO UPDATE"), 2),
)

# NR qualifiers whose value is "hits(seqs)" -> Record attribute.
_NR_HIT_ATTRS = {
    "/TOTAL": "nr_total",
    "/POSITIVE": "nr_positive",
    "/UNKNOWN": "nr_unknown",
    "/FALSE_POS": "nr_false_pos",
}

# CC qualifiers stored verbatim as strings -> Record attribute.
_CC_ATTRS = {
    "/TAXO-RANGE": "cc_taxo_range",
    "/MAX-REPEAT": "cc_max_repeat",
    "/SKIP-FLAG": "cc_skip_flag",
    "/MATRIX_TYPE": "cc_matrix_type",
    "/SCALING_DB": "cc_scaling_db",
    "/AUTHOR": "cc_author",
    "/FT_KEY": "cc_ft_key",
    "/FT_DESC": "cc_ft_desc",
    "/VERSION": "cc_version",
}

# DR type flags -> Record attribute (a list of (accession, name) tuples).
_DR_ATTRS = {
    "T": "dr_positive",
    "F": "dr_false_pos",
    "N": "dr_false_neg",
    "P": "dr_potential",
    "?": "dr_unknown",
}


def _parse_dates(record, value, line):
    """Store the three dates of a DT line on the record (PRIVATE)."""
    # e.g. from January 2017,
    # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
    # Older files had brackets round the date descriptions and used MMM-YYYY
    dates = value.rstrip(".").split("; ")
    if len(dates) < len(_DATE_FIELDS):
        # Without this check a short line would surface as an IndexError.
        raise ValueError("I don't understand date line\n%s" % line)
    for (attribute, endings, words), date in zip(_DATE_FIELDS, dates):
        if not date.endswith(endings):
            raise ValueError("I don't understand date line\n%s" % line)
        # Drop the trailing description word(s), keeping only the date.
        setattr(record, attribute, date.rsplit(" ", words)[0])


def _parse_numerical_results(record, value, line):
    """Store the qualifiers of an NR line on the record (PRIVATE)."""
    for col in value.split(";"):
        if not col:
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == "/RELEASE":
            release, seqs = data.split(",")
            record.nr_sp_release = release
            record.nr_sp_seqs = int(seqs)
        elif qual == "/FALSE_NEG":
            record.nr_false_neg = int(data)
        elif qual == "/PARTIAL":
            record.nr_partial = int(data)
        elif qual in _NR_HIT_ATTRS:
            match = _HITS_RE.match(data)
            if not match:
                raise ValueError(
                    "Broken data %s in comment line\n%s" % (repr(data), line)
                )
            hits = tuple(map(int, match.groups()))
            setattr(record, _NR_HIT_ATTRS[qual], hits)
        else:
            raise ValueError(
                "Unknown qual %s in comment line\n%s" % (repr(qual), line)
            )


def _parse_comments(record, value, line):
    """Store the qualifiers of a CC line on the record (PRIVATE)."""
    # Expect CC lines like this:
    # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
    # Can (normally) split on ";" and then on "="
    for col in value.split(";"):
        if not col or col[:17] == "Automatic scaling":
            # DNAJ_2 in Release 15 has a non-standard comment line:
            # CC   Automatic scaling using reversed database
            # Throw it away.  (Should I keep it?)
            continue
        if "=" not in col:
            # Missing qualifier!  Can we recover gracefully?
            # For example, from Bug 2403, in PS50293 have:
            # CC /AUTHOR=K_Hofmann; N_Hulo
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == "/SITE":
            pos, desc = data.split(",")
            record.cc_site.append((int(pos), desc))
        elif qual in _CC_ATTRS:
            setattr(record, _CC_ATTRS[qual], data)
        else:
            raise ValueError(
                "Unknown qual %s in comment line\n%s" % (repr(qual), line)
            )


def _parse_references(record, value):
    """Store the cross-references of a DR line on the record (PRIVATE)."""
    for ref in value.split(";"):
        if not ref:
            continue
        acc, name, flag = [word.strip() for word in ref.split(",")]
        if flag not in _DR_ATTRS:
            raise ValueError("I don't understand type flag %s" % flag)
        getattr(record, _DR_ATTRS[flag]).append((acc, name))


def __read(handle):
    """Read the next Prosite record from the handle (PRIVATE).

    Lines are consumed up to and including the "//" line that terminates
    the record.  A "//" seen before any ID line is taken to be the end of
    the copyright header and is skipped, as are the CC lines of that header.

    Returns the Record, or None if the handle is exhausted before a
    terminating "//" line is found (a record that is still open at that
    point is discarded).

    Raises ValueError if a line cannot be understood, or if a data line
    appears before any ID line.
    """
    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == "ID":
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s" % line)
            record.name = cols[0]
            record.type = cols[1].rstrip(".")  # don't want '.'
        elif keyword == "//":
            if record is None:
                # Then this was the copyright statement
                continue
            return record
        elif record is None:
            if keyword == "CC":
                # Free-text comment in the copyright header; it is not a
                # list of qualifiers and must not be parsed as one.
                continue
            if keyword in _RECORD_KEYWORDS:
                raise ValueError(
                    "Found %s line before any ID line\n%s" % (keyword, line)
                )
            raise ValueError("Unknown keyword %s found" % keyword)
        elif keyword == "AC":
            record.accession = value.rstrip(";")
        elif keyword == "DT":
            _parse_dates(record, value, line)
        elif keyword == "DE":
            record.description = value
        elif keyword == "PA":
            # Patterns may be continued over several PA lines.
            record.pattern += value
        elif keyword == "MA":
            record.matrix.append(value)
        elif keyword == "PP":
            record.postprocessing.extend(value.split(";"))
        elif keyword == "RU":
            record.rules.append(value)
        elif keyword == "NR":
            _parse_numerical_results(record, value, line)
        elif keyword == "CC":
            _parse_comments(record, value, line)
        elif keyword == "DR":
            _parse_references(record, value)
        elif keyword == "3D":
            record.pdb_structs.extend(
                entry.rstrip(";") for entry in value.split()
            )
        elif keyword == "PR":
            record.prorules.extend(value.split(";"))
        elif keyword == "DO":
            record.pdoc = value.rstrip(";")
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # Ran out of lines without reaching the end of a record.
    return None