# Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Polypeptide-related classes (construction and representation). Simple example with multiple chains, >>> from Bio.PDB.PDBParser import PDBParser >>> from Bio.PDB.Polypeptide import PPBuilder >>> structure = PDBParser().get_structure('2BEG', 'PDB/2BEG.pdb') >>> ppb=PPBuilder() >>> for pp in ppb.build_peptides(structure): ... print(pp.get_sequence()) LVFFAEDVGSNKGAIIGLMVGGVVIA LVFFAEDVGSNKGAIIGLMVGGVVIA LVFFAEDVGSNKGAIIGLMVGGVVIA LVFFAEDVGSNKGAIIGLMVGGVVIA LVFFAEDVGSNKGAIIGLMVGGVVIA Example with non-standard amino acids using HETATM lines in the PDB file, in this case selenomethionine (MSE): >>> from Bio.PDB.PDBParser import PDBParser >>> from Bio.PDB.Polypeptide import PPBuilder >>> structure = PDBParser().get_structure('1A8O', 'PDB/1A8O.pdb') >>> ppb=PPBuilder() >>> for pp in ppb.build_peptides(structure): ... print(pp.get_sequence()) DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW TETLLVQNANPDCKTILKALGPGATLEE TACQG If you want to, you can include non-standard amino acids in the peptides: >>> for pp in ppb.build_peptides(structure, aa_only=False): ... print(pp.get_sequence()) ... print("%s %s" % (pp.get_sequence()[0], pp[0].get_resname())) ... print("%s %s" % (pp.get_sequence()[-7], pp[-7].get_resname())) ... print("%s %s" % (pp.get_sequence()[-6], pp[-6].get_resname())) MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG M MSE M MSE M MSE In this case the selenomethionines (the first and also seventh and sixth from last residues) have been shown as M (methionine) by the get_sequence method. """ from __future__ import print_function from Bio._py3k import basestring import warnings from Bio.Alphabet import generic_protein from Bio.Data import SCOPData from Bio.Seq import Seq from Bio.PDB.PDBExceptions import PDBException from Bio.PDB.vectors import calc_dihedral, calc_angle standard_aa_names = [ "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS", "ILE", "LYS", "LEU", "MET", "ASN", "PRO", "GLN", "ARG", "SER", "THR", "VAL", "TRP", "TYR", ] aa1 = "ACDEFGHIKLMNPQRSTVWY" aa3 = standard_aa_names d1_to_index = {} dindex_to_1 = {} d3_to_index = {} dindex_to_3 = {} # Create some lookup tables for i in range(0, 20): n1 = aa1[i] n3 = aa3[i] d1_to_index[n1] = i dindex_to_1[i] = n1 d3_to_index[n3] = i dindex_to_3[i] = n3 def index_to_one(index): """Index to corresponding one letter amino acid name. >>> index_to_one(0) 'A' >>> index_to_one(19) 'Y' """ return dindex_to_1[index] def one_to_index(s): """One letter code to index. >>> one_to_index('A') 0 >>> one_to_index('Y') 19 """ return d1_to_index[s] def index_to_three(i): """Index to corresponding three letter amino acid name. >>> index_to_three(0) 'ALA' >>> index_to_three(19) 'TYR' """ return dindex_to_3[i] def three_to_index(s): """Three letter code to index. >>> three_to_index('ALA') 0 >>> three_to_index('TYR') 19 """ return d3_to_index[s] def three_to_one(s): """Three letter code to one letter code. >>> three_to_one('ALA') 'A' >>> three_to_one('TYR') 'Y' For non-standard amino acids, you get a KeyError: >>> three_to_one('MSE') Traceback (most recent call last): ... KeyError: 'MSE' """ i = d3_to_index[s] return dindex_to_1[i] def one_to_three(s): """One letter code to three letter code. >>> one_to_three('A') 'ALA' >>> one_to_three('Y') 'TYR' """ i = d1_to_index[s] return dindex_to_3[i] def is_aa(residue, standard=False): """Return True if residue object/string is an amino acid. :param residue: a L{Residue} object OR a three letter amino acid code :type residue: L{Residue} or string :param standard: flag to check for the 20 AA (default false) :type standard: boolean >>> is_aa('ALA') True Known three letter codes for modified amino acids are supported, >>> is_aa('FME') True >>> is_aa('FME', standard=True) False """ # TODO - What about special cases like XXX, can they appear in PDB files? if not isinstance(residue, basestring): residue = residue.get_resname() residue = residue.upper() if standard: return residue in d3_to_index else: return residue in SCOPData.protein_letters_3to1 class Polypeptide(list): """A polypeptide is simply a list of L{Residue} objects.""" def get_ca_list(self): """Get list of C-alpha atoms in the polypeptide. :return: the list of C-alpha atoms :rtype: [L{Atom}, L{Atom}, ...] """ ca_list = [] for res in self: ca = res["CA"] ca_list.append(ca) return ca_list def get_phi_psi_list(self): """Return the list of phi/psi dihedral angles.""" ppl = [] lng = len(self) for i in range(0, lng): res = self[i] try: n = res["N"].get_vector() ca = res["CA"].get_vector() c = res["C"].get_vector() except Exception: # Some atoms are missing # Phi/Psi cannot be calculated for this residue ppl.append((None, None)) res.xtra["PHI"] = None res.xtra["PSI"] = None continue # Phi if i > 0: rp = self[i - 1] try: cp = rp["C"].get_vector() phi = calc_dihedral(cp, n, ca, c) except Exception: phi = None else: # No phi for residue 0! phi = None # Psi if i < (lng - 1): rn = self[i + 1] try: nn = rn["N"].get_vector() psi = calc_dihedral(n, ca, c, nn) except Exception: psi = None else: # No psi for last residue! psi = None ppl.append((phi, psi)) # Add Phi/Psi to xtra dict of residue res.xtra["PHI"] = phi res.xtra["PSI"] = psi return ppl def get_tau_list(self): """List of tau torsions angles for all 4 consecutive Calpha atoms.""" ca_list = self.get_ca_list() tau_list = [] for i in range(0, len(ca_list) - 3): atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2], ca_list[i + 3]) v1, v2, v3, v4 = [a.get_vector() for a in atom_list] tau = calc_dihedral(v1, v2, v3, v4) tau_list.append(tau) # Put tau in xtra dict of residue res = ca_list[i + 2].get_parent() res.xtra["TAU"] = tau return tau_list def get_theta_list(self): """List of theta angles for all 3 consecutive Calpha atoms.""" theta_list = [] ca_list = self.get_ca_list() for i in range(0, len(ca_list) - 2): atom_list = (ca_list[i], ca_list[i + 1], ca_list[i + 2]) v1, v2, v3 = [a.get_vector() for a in atom_list] theta = calc_angle(v1, v2, v3) theta_list.append(theta) # Put tau in xtra dict of residue res = ca_list[i + 1].get_parent() res.xtra["THETA"] = theta return theta_list def get_sequence(self): """Return the AA sequence as a Seq object. :return: polypeptide sequence :rtype: L{Seq} """ s = "" for res in self: s += SCOPData.protein_letters_3to1.get(res.get_resname(), "X") seq = Seq(s, generic_protein) return seq def __repr__(self): """Return string representation of the polypeptide. Return , where START and END are sequence identifiers of the outer residues. """ start = self[0].get_id()[1] end = self[-1].get_id()[1] s = "" % (start, end) return s class _PPBuilder(object): """Base class to extract polypeptides. It checks if two consecutive residues in a chain are connected. The connectivity test is implemented by a subclass. This assumes you want both standard and non-standard amino acids. """ def __init__(self, radius): """Initialize the base class. :param radius: distance :type radius: float """ self.radius = radius def _accept(self, residue, standard_aa_only): """Check if the residue is an amino acid (PRIVATE).""" if is_aa(residue, standard=standard_aa_only): return True elif not standard_aa_only and "CA" in residue.child_dict: # It has an alpha carbon... # We probably need to update the hard coded list of # non-standard residues, see function is_aa for details. warnings.warn( "Assuming residue %s is an unknown modified amino acid" % residue.get_resname() ) return True else: # not a standard AA so skip return False def build_peptides(self, entity, aa_only=1): """Build and return a list of Polypeptide objects. :param entity: polypeptides are searched for in this object :type entity: L{Structure}, L{Model} or L{Chain} :param aa_only: if 1, the residue needs to be a standard AA :type aa_only: int """ is_connected = self._is_connected accept = self._accept level = entity.get_level() # Decide which entity we are dealing with if level == "S": model = entity[0] chain_list = model.get_list() elif level == "M": chain_list = entity.get_list() elif level == "C": chain_list = [entity] else: raise PDBException("Entity should be Structure, Model or Chain.") pp_list = [] for chain in chain_list: chain_it = iter(chain) try: prev_res = next(chain_it) while not accept(prev_res, aa_only): prev_res = next(chain_it) except StopIteration: # No interesting residues at all in this chain continue pp = None for next_res in chain_it: if ( accept(prev_res, aa_only) and accept(next_res, aa_only) and is_connected(prev_res, next_res) ): if pp is None: pp = Polypeptide() pp.append(prev_res) pp_list.append(pp) pp.append(next_res) else: # Either too far apart, or one of the residues is unwanted. # End the current peptide pp = None prev_res = next_res return pp_list class CaPPBuilder(_PPBuilder): """Use CA--CA distance to find polypeptides.""" def __init__(self, radius=4.3): """Initialize the class.""" _PPBuilder.__init__(self, radius) def _is_connected(self, prev_res, next_res): for r in [prev_res, next_res]: if not r.has_id("CA"): return False n = next_res["CA"] p = prev_res["CA"] # Unpack disordered if n.is_disordered(): nlist = n.disordered_get_list() else: nlist = [n] if p.is_disordered(): plist = p.disordered_get_list() else: plist = [p] for nn in nlist: for pp in plist: if (nn - pp) < self.radius: return True return False class PPBuilder(_PPBuilder): """Use C--N distance to find polypeptides.""" def __init__(self, radius=1.8): """Initialize the class.""" _PPBuilder.__init__(self, radius) def _is_connected(self, prev_res, next_res): if not prev_res.has_id("C"): return False if not next_res.has_id("N"): return False test_dist = self._test_dist c = prev_res["C"] n = next_res["N"] # Test all disordered atom positions! if c.is_disordered(): clist = c.disordered_get_list() else: clist = [c] if n.is_disordered(): nlist = n.disordered_get_list() else: nlist = [n] for nn in nlist: for cc in clist: # To form a peptide bond, N and C must be # within radius and have the same altloc # identifier or one altloc blank n_altloc = nn.get_altloc() c_altloc = cc.get_altloc() if n_altloc == c_altloc or n_altloc == " " or c_altloc == " ": if test_dist(nn, cc): # Select the disordered atoms that # are indeed bonded if c.is_disordered(): c.disordered_select(c_altloc) if n.is_disordered(): n.disordered_select(n_altloc) return True return False def _test_dist(self, c, n): """Return 1 if distance between atoms