# Copyright 2004 by Iddo Friedberg.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Reduced alphabets which lump together several amino-acids into one letter.

Reduced (redundant or simplified) alphabets are used to represent protein
sequences using an alternative alphabet which lumps together several
amino-acids into one letter, based on physico-chemical traits. For example,
all the aliphatics (I,L,V) are usually quite interchangeable, so many
sequence studies group them into one letter.

Examples of reduced alphabets are available in:

http://viscose.herokuapp.com/html/alphabets.html

The Murphy tables are from here:

Murphy L.R., Wallqvist A, Levy RM. (2000) Simplified amino acid
alphabets for protein fold recognition and implications for folding.
Protein Eng. 13(3):149-152

These alphabets have been used with Bio.utils.reduce_sequence, which has been
removed from Biopython. You can use these alphabets and tables like this:

    >>> from Bio.Seq import Seq
    >>> from Bio import Alphabet
    >>> from Bio.Alphabet import Reduced
    >>> my_protein = Seq('MAGSKEWKRFCELTINEA', Alphabet.ProteinAlphabet())

Now, we convert this sequence into a sequence which only recognizes polar (P)
or hydrophobic (H) residues:

    >>> new_protein = Seq('', Alphabet.Reduced.HPModel())
    >>> for aa in my_protein:
    ...     new_protein += Alphabet.Reduced.hp_model_tab[aa]
    >>> new_protein
    Seq('HPPPPPHPPHHPHPHPPP', HPModel())

The following Alphabet classes are available:

- Murphy15: Maps 20 amino acids to 15, use murphy_15_tab for conversion,
  ambiguous letters: L: LVIM, F: FY, K: KR
- Murphy10: Maps 20 amino acids to 10, use murphy_10_tab for conversion,
  ambiguous letters: L: LVIM, S: ST, F: FYW, E: EDNQ, K: KR
- Murphy8: Maps 20 amino acids to 8, use murphy_8_tab for conversion,
  ambiguous letters: L: LVIMC, A: AG, S: ST, F: FYW, E: EDNQ, K: KR
- Murphy4: Maps 20 amino acids to 4, use murphy_4_tab for conversion,
  ambiguous letters: L: LVIMC, A: AGSTP, F: FYW, E: EDNQKRH
- HPModel: Groups amino acids as polar (hydrophilic) or hydrophobic
  (non-polar), use hp_model_tab for conversion,
  P: AGTSNQDEHRKP, H: CMFILVWY
- PC5: Amino acids grouped according to 5 physico-chemical properties,
  use pc_5_table for conversion, A (Aliphatic): IVL, R (aRomatic): FYWH,
  C (Charged): KRDE, T (Tiny): GACS, D (Diverse): TMQNP

"""

from Bio import Alphabet


def _groups_to_table(groups):
    """Expand ``(letter, members)`` pairs into a residue -> letter dict.

    Each pair names one reduced letter and the string of amino acids it
    stands for. Entries are inserted group by group, member by member, so
    the order of the pairs determines the order of the resulting keys.
    """
    table = {}
    for letter, members in groups:
        for residue in members:
            table[residue] = letter
    return table


# Murphy et al. (2000), 15-letter alphabet: only LVIM, FY and KR are merged.
murphy_15_tab = _groups_to_table((
    ("L", "LVIM"),
    ("C", "C"),
    ("A", "A"),
    ("G", "G"),
    ("S", "S"),
    ("T", "T"),
    ("P", "P"),
    ("F", "FY"),
    ("W", "W"),
    ("E", "E"),
    ("D", "D"),
    ("N", "N"),
    ("Q", "Q"),
    ("K", "KR"),
    ("H", "H"),
))


class Murphy15(Alphabet.ProteinAlphabet):
    """Reduced protein alphabet with 15 letters.

    Letters: A, C, D, E, G, H, N, P, Q, S, T, W, L(LVIM), F(FY), K(KR)
    """

    letters = "LCAGSTPFWEDNQKH"
    size = 1


murphy_15 = Murphy15()


# Murphy et al. (2000), 10-letter alphabet.
murphy_10_tab = _groups_to_table((
    ("L", "LVIM"),
    ("C", "C"),
    ("A", "A"),
    ("G", "G"),
    ("S", "ST"),
    ("P", "P"),
    ("F", "FYW"),
    ("E", "EDNQ"),
    ("K", "KR"),
    ("H", "H"),
))


class Murphy10(Alphabet.ProteinAlphabet):
    """Reduced protein alphabet with 10 letters.

    Letters: A, C, G, H, P, L(LVIM), S(ST), F(FYW), E(EDNQ), K(KR)
    """

    letters = "LCAGSPFEKH"
    size = 1


murphy_10 = Murphy10()


# Murphy et al. (2000), 8-letter alphabet.
murphy_8_tab = _groups_to_table((
    ("L", "LVIMC"),
    ("A", "AG"),
    ("S", "ST"),
    ("P", "P"),
    ("F", "FYW"),
    ("E", "EDNQ"),
    ("K", "KR"),
    ("H", "H"),
))


class Murphy8(Alphabet.ProteinAlphabet):
    """Reduced protein alphabet with 8 letters.

    Letters: H, P, L(LVIMC), A(AG), S(ST), F(FYW), E(EDNQ), K(KR)
    """

    letters = "LASPFEKH"
    size = 1


murphy_8 = Murphy8()


# Murphy et al. (2000), 4-letter alphabet.
murphy_4_tab = _groups_to_table((
    ("L", "LVIMC"),
    ("A", "AGSTP"),
    ("F", "FYW"),
    ("E", "EDNQKRH"),
))


class Murphy4(Alphabet.ProteinAlphabet):
    """Reduced protein alphabet with 4 letters.

    Letters: L(LVIMC), A(AGSTP), F(FYW), E(EDNQKRH)
    """

    letters = "LAFE"
    size = 1


murphy_4 = Murphy4()


# Two-state model: every residue is either polar (P) or hydrophobic (H).
hp_model_tab = _groups_to_table((
    ("P", "AGTSNQDEHRKP"),  # Hydrophilic
    ("H", "CMFILVWY"),  # Hydrophobic
))


class HPModel(Alphabet.ProteinAlphabet):
    """Reduced protein alphabet with only two letters for polar or hydrophobic.

    Letters: P (polar: AGTSNQDEHRKP), H (hydrophobic: CMFILVWY)
    """

    letters = "HP"
    size = 1


hp_model = HPModel()


# Five physico-chemical classes.
pc_5_table = _groups_to_table((
    ("A", "IVL"),  # Aliphatic
    ("R", "FYWH"),  # Aromatic
    ("C", "KRDE"),  # Charged
    ("T", "GACS"),  # Tiny
    ("D", "TMQNP"),  # Diverse
))


class PC5(Alphabet.ProteinAlphabet):
    """Reduced protein alphabet with 5 letters for physico-chemical properties.

    Letters: A (Aliphatic: IVL), R (aRomatic: FYWH), C (Charged: KRDE),
    T (Tiny: GACS), D (Diverse: TMQNP)
    """

    letters = "ARCTD"
    size = 1


pc5 = PC5()