# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) # This code is part of the Biopython distribution and governed by its # license. Please see the LICENSE file that should have been included # as part of this package. """Use the DSSP program to calculate secondary structure and accessibility. You need to have a working version of DSSP (and a license, free for academic use) in order to use this. For DSSP, see U{http://swift.cmbi.ru.nl/gv/dssp/}. The DSSP codes for secondary structure used here are: - H Alpha helix (4-12) - B Isolated beta-bridge residue - E Strand - G 3-10 helix - I pi helix - T Turn - S Bend - - None """ from __future__ import print_function __docformat__ = "epytext en" import re from Bio._py3k import StringIO import subprocess from Bio.Data import SCOPData from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap from Bio.PDB.PDBExceptions import PDBException from Bio.PDB.PDBParser import PDBParser # Match C in DSSP _dssp_cys=re.compile('[a-z]') # Maximal ASA of amino acids # Values from Sander & Rost, (1994), Proteins, 20:216-226 # Used for relative accessibility MAX_ACC={} MAX_ACC["ALA"]=106.0 MAX_ACC["CYS"]=135.0 MAX_ACC["ASP"]=163.0 MAX_ACC["GLU"]=194.0 MAX_ACC["PHE"]=197.0 MAX_ACC["GLY"]=84.0 MAX_ACC["HIS"]=184.0 MAX_ACC["ILE"]=169.0 MAX_ACC["LYS"]=205.0 MAX_ACC["LEU"]=164.0 MAX_ACC["MET"]=188.0 MAX_ACC["ASN"]=157.0 MAX_ACC["PRO"]=136.0 MAX_ACC["GLN"]=198.0 MAX_ACC["ARG"]=248.0 MAX_ACC["SER"]=130.0 MAX_ACC["THR"]=142.0 MAX_ACC["VAL"]=142.0 MAX_ACC["TRP"]=227.0 MAX_ACC["TYR"]=222.0 def ss_to_index(ss): """ Secondary structure symbol to index. H=0 E=1 C=2 """ if ss=='H': return 0 if ss=='E': return 1 if ss=='C': return 2 assert 0 def dssp_dict_from_pdb_file(in_file, DSSP="dssp"): """ Create a DSSP dictionary from a PDB file. Example: >>> dssp_dict=dssp_dict_from_pdb_file("1fat.pdb") >>> aa, ss, acc=dssp_dict[('A', 1)] @param in_file: pdb file @type in_file: string @param DSSP: DSSP executable (argument to os.system) @type DSSP: string @return: a dictionary that maps (chainid, resid) to amino acid type, secondary structure code and accessibility. @rtype: {} """ #Using universal newlines is important on Python 3, this #gives unicode handles rather than bytes handles. p = subprocess.Popen([DSSP, in_file], universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() out_dict, keys = _make_dssp_dict(StringIO(out)) return out_dict, keys def make_dssp_dict(filename): """ Return a DSSP dictionary that maps (chainid, resid) to aa, ss and accessibility, from a DSSP file. @param filename: the DSSP output file @type filename: string """ with open(filename, "r") as handle: return _make_dssp_dict(handle) def _make_dssp_dict(handle): """ Return a DSSP dictionary that maps (chainid, resid) to aa, ss and accessibility, from an open DSSP file object. @param handle: the open DSSP output file handle @type handle: file """ dssp = {} start = 0 keys = [] for l in handle.readlines(): sl = l.split() if len(sl) < 2: continue if sl[1] == "RESIDUE": # Start parsing from here start = 1 continue if not start: continue if l[9] == " ": # Skip -- missing residue continue resseq = int(l[5:10]) icode = l[10] chainid = l[11] aa = l[13] ss = l[16] if ss == " ": ss = "-" try: acc = int(l[34:38]) phi = float(l[103:109]) psi = float(l[109:115]) except ValueError as exc: # DSSP output breaks its own format when there are >9999 # residues, since only 4 digits are allocated to the seq num # field. See 3kic chain T res 321, 1vsy chain T res 6077. # Here, look for whitespace to figure out the number of extra # digits, and shift parsing the rest of the line by that amount. if l[34] != ' ': shift = l[34:].find(' ') acc = int((l[34+shift:38+shift])) phi = float(l[103+shift:109+shift]) psi = float(l[109+shift:115+shift]) else: raise ValueError(exc) res_id = (" ", resseq, icode) dssp[(chainid, res_id)] = (aa, ss, acc, phi, psi) keys.append((chainid, res_id)) return dssp, keys class DSSP(AbstractResiduePropertyMap): """ Run DSSP on a pdb file, and provide a handle to the DSSP secondary structure and accessibility. Note that DSSP can only handle one model. Example: >>> p = PDBParser() >>> structure = p.get_structure("1MOT", "1MOT.pdb") >>> model = structure[0] >>> dssp = DSSP(model, "1MOT.pdb") >>> # DSSP data is accessed by a tuple (chain_id, res_id) >>> a_key = list(dssp)[2] >>> # residue object, secondary structure, solvent accessibility, >>> # relative accessiblity, phi, psi >>> dssp[a_key] (, 'H', 72, 0.67924528301886788, -61.200000000000003, -42.399999999999999) """ def __init__(self, model, pdb_file, dssp="dssp"): """ @param model: the first model of the structure @type model: L{Model} @param pdb_file: a PDB file @type pdb_file: string @param dssp: the dssp executable (ie. the argument to os.system) @type dssp: string """ # create DSSP dictionary dssp_dict, dssp_keys = dssp_dict_from_pdb_file(pdb_file, dssp) dssp_map = {} dssp_list = [] def resid2code(res_id): """Serialize a residue's resseq and icode for easy comparison.""" return '%s%s' % (res_id[1], res_id[2]) # Now create a dictionary that maps Residue objects to # secondary structure and accessibility, and a list of # (residue, (secondary structure, accessibility)) tuples for key in dssp_keys: chain_id, res_id = key chain = model[chain_id] try: res = chain[res_id] except KeyError: # In DSSP, HET field is not considered in residue identifier. # Thus HETATM records may cause unnecessary exceptions. # (See 3jui chain A res 593.) # Try the lookup again with all HETATM other than water res_seq_icode = resid2code(res_id) for r in chain: if r.id[0] not in (' ', 'W'): # Compare resseq + icode if resid2code(r.id) == res_seq_icode: # Found a matching residue res = r break else: raise KeyError(res_id) # For disordered residues of point mutations, BioPython uses the # last one as default, But DSSP takes the first one (alternative # location is blank, A or 1). See 1h9h chain E resi 22. # Here we select the res in which all atoms have altloc blank, A or # 1. If no such residues are found, simply use the first one appears # (as DSSP does). if res.is_disordered() == 2: for rk in res.disordered_get_id_list(): # All atoms in the disordered residue should have the same # altloc, so it suffices to check the altloc of the first # atom. altloc = res.child_dict[rk].get_list()[0].get_altloc() if altloc in tuple('A1 '): res.disordered_select(rk) break else: # Simply select the first one res.disordered_select(res.disordered_get_id_list()[0]) # Sometimes point mutations are put into HETATM and ATOM with altloc # 'A' and 'B'. # See 3piu chain A residue 273: # # # DSSP uses the HETATM LLP as it has altloc 'A' # We check the altloc code here. elif res.is_disordered() == 1: # Check altloc of all atoms in the DisorderedResidue. If it # contains blank, A or 1, then use it. Otherwise, look for HET # residues of the same seq+icode. If not such HET residues are # found, just accept the current one. altlocs = set(a.get_altloc() for a in res.get_unpacked_list()) if altlocs.isdisjoint('A1 '): # Try again with all HETATM other than water res_seq_icode = resid2code(res_id) for r in chain: if r.id[0] not in (' ', 'W'): if resid2code(r.id) == res_seq_icode and \ r.get_list()[0].get_altloc() in tuple('A1 '): res = r break aa, ss, acc, phi, psi = dssp_dict[key] res.xtra["SS_DSSP"] = ss res.xtra["EXP_DSSP_ASA"] = acc res.xtra["PHI_DSSP"] = phi res.xtra["PSI_DSSP"] = psi # Relative accessibility resname = res.get_resname() try: rel_acc = acc/MAX_ACC[resname] except KeyError: # Invalid value for resname rel_acc = 'NA' else: if rel_acc > 1.0: rel_acc = 1.0 res.xtra["EXP_DSSP_RASA"] = rel_acc # Verify if AA in DSSP == AA in Structure # Something went wrong if this is not true! # NB: DSSP uses X often resname = SCOPData.protein_letters_3to1.get(resname, 'X') if resname == "C": # DSSP renames C in C-bridges to a,b,c,d,... # - we rename it back to 'C' if _dssp_cys.match(aa): aa = 'C' # Take care of HETATM again if (resname != aa) and (res.id[0] == ' ' or aa != 'X'): raise PDBException("Structure/DSSP mismatch at %s" % res) dssp_map[key] = ((res, ss, acc, rel_acc, phi, psi)) dssp_list.append((res, ss, acc, rel_acc, phi, psi)) AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, dssp_list) if __name__ == "__main__": import sys p = PDBParser() s = p.get_structure('X', sys.argv[1]) model = s[0] d = DSSP(model, sys.argv[1]) for r in d: print(r) print("Handled %i residues" % len(d)) print(sorted(d)) if ('A', 1) in d: print(d[('A', 1)]) print(s[0]['A'][1].xtra) # Secondary structure print(''.join(item[1] for item in d))