#!/usr/bin/env python # Copyright 2004 Kristian Rother # # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. # """Parse header of PDB files into a python dictionary. Emerged from the Columba database project www.columba-db.de. Original author: Kristian Rother. """ # license: same as Biopython, read LICENSE.TXT from current Biopython release. # # last modified: 9.2.2004 # # Added some small changes: the whole PDB file is not read in anymore, but just # until the first ATOM record (faster). I also split parse_pdb_header into # parse_pdb_header and parse_pdb_header_list, because parse_pdb_header_list # can be more easily reused in PDBParser. # # Thomas, 19/03/04 # # Renamed some clearly private functions to _something (ie. parse_pdb_header_list # is now _parse_pdb_header_list) # Thomas 9/05/04 from __future__ import print_function import re from Bio import File def _get_journal(inl): # JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7 journal = "" for l in inl: if re.search(r"\AJRNL", l): journal += l[19:72].lower() journal = re.sub(r"\s\s+", " ", journal) return journal def _get_references(inl): # REMARK 1 REFERENCE 1 1CSE 11 # REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12 references = [] actref = "" for l in inl: if re.search(r"\AREMARK 1", l): if re.search(r"\AREMARK 1 REFERENCE", l): if actref != "": actref = re.sub(r"\s\s+", " ", actref) if actref != " ": references.append(actref) actref = "" else: actref += l[19:72].lower() if actref != "": actref = re.sub(r"\s\s+", " ", actref) if actref != " ": references.append(actref) return references # bring dates to format: 1909-01-08 def _format_date(pdb_date): """Convert dates from DD-Mon-YY to YYYY-MM-DD format (PRIVATE).""" date = "" year = int(pdb_date[7:]) if year < 50: century = 2000 else: century = 1900 date = str(century + year) + "-" all_months = ['xxx', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] month = str(all_months.index(pdb_date[3:6])) if len(month) == 1: month = '0' + month date = date + month + '-' + pdb_date[:2] return date def _chop_end_codes(line): """Chops lines ending with ' 1CSA 14' and the like (PRIVATE).""" return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line) def _chop_end_misc(line): """Chops lines ending with ' 14-JUL-97 1CSA' and the like (PRIVATE).""" return re.sub(r"\s\s\s\s+.*\Z", "", line) def _nice_case(line): """Make A Lowercase String With Capitals (PRIVATE).""" line_lower = line.lower() s = "" i = 0 nextCap = 1 while i < len(line_lower): c = line_lower[i] if c >= 'a' and c <= 'z' and nextCap: c = c.upper() nextCap = 0 elif c in ' .,;:\t-_': nextCap = 1 s += c i += 1 return s def parse_pdb_header(infile): """Return the header lines of a pdb file as a dictionary. Dictionary keys are: head, deposition_date, release_date, structure_method, resolution, structure_reference, journal_reference, author and compound. """ header = [] with File.as_handle(infile, 'r') as f: for l in f: record_type = l[0:6] if record_type in ("ATOM ", "HETATM", "MODEL "): break else: header.append(l) return _parse_pdb_header_list(header) def _parse_pdb_header_list(header): # database fields dict = { 'name': "", 'head': '', 'deposition_date': "1909-01-08", 'release_date': "1909-01-08", 'structure_method': "unknown", 'resolution': 0.0, 'structure_reference': "unknown", 'journal_reference': "unknown", 'author': "", 'compound': {'1': {'misc': ''}}, 'source': {'1': {'misc': ''}}} dict['structure_reference'] = _get_references(header) dict['journal_reference'] = _get_journal(header) comp_molid = "1" last_comp_key = "misc" last_src_key = "misc" for hh in header: h = re.sub(r"[\s\n\r]*\Z", "", hh) # chop linebreaks off # key=re.sub("\s.+\s*","",h) key = h[:6].strip() # tail=re.sub("\A\w+\s+\d*\s*","",h) tail = h[10:].strip() # print("%s:%s" % (key, tail) # From here, all the keys from the header are being parsed if key == "TITLE": name = _chop_end_codes(tail).lower() if 'name' in dict: dict['name'] += " " + name else: dict['name'] = name elif key == "HEADER": rr = re.search(r"\d\d-\w\w\w-\d\d", tail) if rr is not None: dict['deposition_date'] = _format_date(_nice_case(rr.group())) head = _chop_end_misc(tail).lower() dict['head'] = head elif key == "COMPND": tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower() # look for E.C. numbers in COMPND lines rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt) if rec: dict['compound'][comp_molid]['ec_number'] = rec.group() tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt) tok = tt.split(":") if len(tok) >= 2: ckey = tok[0] cval = re.sub(r"\A\s*", "", tok[1]) if ckey == 'mol_id': dict['compound'][cval] = {'misc': ''} comp_molid = cval last_comp_key = "misc" else: dict['compound'][comp_molid][ckey] = cval last_comp_key = ckey else: dict['compound'][comp_molid][last_comp_key] += tok[0] + " " elif key == "SOURCE": tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower() tok = tt.split(":") # print(tok) if len(tok) >= 2: ckey = tok[0] cval = re.sub(r"\A\s*", "", tok[1]) if ckey == 'mol_id': dict['source'][cval] = {'misc': ''} comp_molid = cval last_src_key = "misc" else: dict['source'][comp_molid][ckey] = cval last_src_key = ckey else: dict['source'][comp_molid][last_src_key] += tok[0] + " " elif key == "KEYWDS": kwd = _chop_end_codes(tail).lower() if 'keywords' in dict: dict['keywords'] += " " + kwd else: dict['keywords'] = kwd elif key == "EXPDTA": expd = _chop_end_codes(tail) # chop junk at end of lines for some structures expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd) # if re.search('\Anmr',expd,re.IGNORECASE): expd='nmr' # if re.search('x-ray diffraction',expd,re.IGNORECASE): expd='x-ray diffraction' dict['structure_method'] = expd.lower() elif key == "CAVEAT": # make Annotation entries out of these!!! pass elif key == "REVDAT": rr = re.search(r"\d\d-\w\w\w-\d\d", tail) if rr is not None: dict['release_date'] = _format_date(_nice_case(rr.group())) elif key == "JRNL": # print("%s:%s" % (key, tail)) if 'journal' in dict: dict['journal'] += tail else: dict['journal'] = tail elif key == "AUTHOR": auth = _nice_case(_chop_end_codes(tail)) if 'author' in dict: dict['author'] += auth else: dict['author'] = auth elif key == "REMARK": if re.search("REMARK 2 RESOLUTION.", hh): r = _chop_end_codes(re.sub("REMARK 2 RESOLUTION.", '', hh)) r = re.sub(r"\s+ANGSTROM.*", "", r) try: dict['resolution'] = float(r) except ValueError: # print('nonstandard resolution %r' % r) dict['resolution'] = None else: # print(key) pass if dict['structure_method'] == 'unknown': if dict['resolution'] > 0.0: dict['structure_method'] = 'x-ray diffraction' return dict