# Copyright 2000 Andrew Dalke. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Information about the IUPAC alphabets."""

# One-letter codes of the 20 standard amino acids.
protein_letters = "ACDEFGHIKLMNPQRSTVWY"
# The standard letters followed by the ambiguity / rare-residue codes below.
extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
#   B = "Asx";  aspartic acid or asparagine (D or N)
#   X = "Xaa";  unknown or 'other' amino acid
#   Z = "Glx";  glutamic acid or glutamine (E or Q)
#   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
#
#   J = "Xle";  leucine or isoleucine (L or I, used in NMR)
#   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#   Also the International Nucleotide Sequence Database Collaboration (INSDC)
#   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
#   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
#
#   Xle (J); Leucine or Isoleucine
#   The residue abbreviations, Xle (the three-letter abbreviation) and J
#   (the one-letter abbreviation) are reserved for the case that cannot
#   experimentally distinguish leucine from isoleucine.
#
#   U = "Sec";  selenocysteine
#   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#
#   O = "Pyl";  pyrrolysine
#   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35

# One-letter -> three-letter code for the 20 standard amino acids.
protein_letters_1to3 = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asn",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
}
# Standard table plus the six extended codes.  dict(base, **extras) copies
# the base mapping and appends the extras in order, without building
# intermediate item lists.
protein_letters_1to3_extended = dict(
    protein_letters_1to3,
    B="Asx",
    X="Xaa",
    Z="Glx",
    J="Xle",
    U="Sec",
    O="Pyl",
)

# Inverse lookups: three-letter code -> one-letter code.
protein_letters_3to1 = {
    three: one for one, three in protein_letters_1to3.items()
}
protein_letters_3to1_extended = {
    three: one for one, three in protein_letters_1to3_extended.items()
}

ambiguous_dna_letters = "GATCRYWSMKHBVDN"
unambiguous_dna_letters = "GATC"
ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
unambiguous_rna_letters = "GAUC"

#   B == 5-bromouridine
#   D == 5,6-dihydrouridine
#   S == thiouridine
#   W == wyosine
extended_dna_letters = "GATCBDSW"

# are there extended forms?
# extended_rna_letters = "GAUCBDSW"

# "X" is included in the following _values and _complement dictionaries,
# for historical reasons although it is not an IUPAC nucleotide,
# and so is not in the corresponding _letters strings above
ambiguous_dna_values = {
    "A": "A",
    "C": "C",
    "G": "G",
    "T": "T",
    "M": "AC",
    "R": "AG",
    "W": "AT",
    "S": "CG",
    "Y": "CT",
    "K": "GT",
    "V": "ACG",
    "H": "ACT",
    "D": "AGT",
    "B": "CGT",
    "X": "GATC",
    "N": "GATC",
}
ambiguous_rna_values = {
    "A": "A",
    "C": "C",
    "G": "G",
    "U": "U",
    "M": "AC",
    "R": "AG",
    "W": "AU",
    "S": "CG",
    "Y": "CU",
    "K": "GU",
    "V": "ACG",
    "H": "ACU",
    "D": "AGU",
    "B": "CGU",
    "X": "GAUC",
    "N": "GAUC",
}

ambiguous_dna_complement = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}

ambiguous_rna_complement = {
    "A": "U",
    "C": "G",
    "G": "C",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}


def _make_ranges(mydict):
    """Return a dict mapping each key of mydict to a (value, value) tuple.

    This gives an unambiguous weight table the same (low, high) shape as
    the range tables produced by _make_ambiguous_ranges.
    """
    return {key: (value, value) for key, value in mydict.items()}


# Mass data taken from PubChem


# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {
    "A": 331.2218,
    "C": 307.1971,
    "G": 347.2212,
    "T": 322.2085,
}

# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
    "A": 331.06817,
    "C": 307.056936,
    "G": 347.063084,
    "T": 322.056602,
}

unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

unambiguous_rna_weights = {
    "A": 347.2212,
    "C": 323.1965,
    "G": 363.2206,
    "U": 324.1813,
}

monoisotopic_unambiguous_rna_weights = {
    "A": 347.063084,
    "C": 323.051851,
    "G": 363.057999,
    "U": 324.035867,
}

unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


def _make_ambiguous_ranges(mydict, weight_table):
    """Build weight-range and average-weight tables for ambiguity codes.

    Arguments:
     - mydict - maps each letter to the string of unambiguous letters it
       may stand for (e.g. ``"R": "AG"``).
     - weight_table - maps unambiguous letters to their weights.

    Returns a tuple ``(range_d, avg_d)`` where ``range_d[letter]`` is the
    ``(min, max)`` weight over the letter's expansion and ``avg_d[letter]``
    is the arithmetic mean of those weights.

    Letters whose expansion is empty, or names any letter missing from
    weight_table, have no well-defined weight and are left out of both
    returned dictionaries.
    """
    range_d = {}
    avg_d = {}
    for letter, values in mydict.items():
        # Skip anything we cannot weigh completely.  Checking every
        # constituent (not just single-letter expansions) avoids feeding
        # None into min()/max() when only part of an expansion is known.
        if not values or any(x not in weight_table for x in values):
            continue

        weights = [weight_table[x] for x in values]
        range_d[letter] = (min(weights), max(weights))
        # Accumulate left to right instead of calling sum(): the float
        # summation used by sum() changed in Python 3.12, and a plain loop
        # keeps the averages bit-for-bit stable across interpreter versions.
        total_w = 0.0
        for w in weights:
            total_w = total_w + w
        avg_d[letter] = total_w / len(weights)
    return range_d, avg_d


ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
    ambiguous_dna_values, unambiguous_dna_weights
)

ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
    ambiguous_rna_values, unambiguous_rna_weights
)

protein_weights = {
    "A": 89.0932,
    "C": 121.1582,
    "D": 133.1027,
    "E": 147.1293,
    "F": 165.1891,
    "G": 75.0666,
    "H": 155.1546,
    "I": 131.1729,
    "K": 146.1876,
    "L": 131.1729,
    "M": 149.2113,
    "N": 132.1179,
    "O": 255.3134,
    "P": 115.1305,
    "Q": 146.1445,
    "R": 174.201,
    "S": 105.0926,
    "T": 119.1192,
    "U": 168.0532,
    "V": 117.1463,
    "W": 204.2252,
    "Y": 181.1885,
}

monoisotopic_protein_weights = {
    "A": 89.047678,
    "C": 121.019749,
    "D": 133.037508,
    "E": 147.053158,
    "F": 165.078979,
    "G": 75.032028,
    "H": 155.069477,
    "I": 131.094629,
    "K": 146.105528,
    "L": 131.094629,
    "M": 149.051049,
    "N": 132.053492,
    "O": 255.158292,
    "P": 115.063329,
    "Q": 146.069142,
    "R": 174.111676,
    "S": 105.042593,
    "T": 119.058243,
    "U": 168.964203,
    "V": 117.078979,
    "W": 204.089878,
    "Y": 181.073893,
}

extended_protein_values = {
    "A": "A",
    "B": "ND",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "IL",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "O",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "U",
    "V": "V",
    "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    # by MP: Won't do this, because they are so rare.
    "Y": "Y",
    "Z": "QE",
}

protein_weight_ranges = _make_ranges(protein_weights)

extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
    extended_protein_values, protein_weights
)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
# Element symbol -> weight lookup.  Entries are laid out in rows; every
# value literal and the insertion order are kept exactly as given.
# NOTE(review): "D" presumably denotes deuterium - confirm against callers.
atom_weights = {
    "H": 1.00794, "D": 2.01410, "He": 4.002602,

    "Li": 6.941, "Be": 9.012182, "B": 10.811, "C": 12.0107,
    "N": 14.0067, "O": 15.9994, "F": 18.9984032, "Ne": 20.1797,

    "Na": 22.989770, "Mg": 24.3050, "Al": 26.981538, "Si": 28.0855,
    "P": 30.973761, "S": 32.065, "Cl": 35.453, "Ar": 39.948,

    "K": 39.0983, "Ca": 40.078, "Sc": 44.955910, "Ti": 47.867,
    "V": 50.9415, "Cr": 51.9961, "Mn": 54.938049, "Fe": 55.845,
    "Co": 58.933200, "Ni": 58.6934, "Cu": 63.546, "Zn": 65.39,
    "Ga": 69.723, "Ge": 72.64, "As": 74.92160, "Se": 78.96,
    "Br": 79.904, "Kr": 83.80,

    "Rb": 85.4678, "Sr": 87.62, "Y": 88.90585, "Zr": 91.224,
    "Nb": 92.90638, "Mo": 95.94, "Tc": 98.0, "Ru": 101.07,
    "Rh": 102.90550, "Pd": 106.42, "Ag": 107.8682, "Cd": 112.411,
    "In": 114.818, "Sn": 118.710, "Sb": 121.760, "Te": 127.60,
    "I": 126.90447, "Xe": 131.293,

    "Cs": 132.90545, "Ba": 137.327, "La": 138.9055, "Ce": 140.116,
    "Pr": 140.90765, "Nd": 144.24, "Pm": 145.0, "Sm": 150.36,
    "Eu": 151.964, "Gd": 157.25, "Tb": 158.92534, "Dy": 162.50,
    "Ho": 164.93032, "Er": 167.259, "Tm": 168.93421, "Yb": 173.04,
    "Lu": 174.967, "Hf": 178.49, "Ta": 180.9479, "W": 183.84,
    "Re": 186.207, "Os": 190.23, "Ir": 192.217, "Pt": 195.078,
    "Au": 196.96655, "Hg": 200.59, "Tl": 204.3833, "Pb": 207.2,
    "Bi": 208.98038, "Po": 208.98, "At": 209.99, "Rn": 222.02,

    "Fr": 223.02, "Ra": 226.03, "Ac": 227.03, "Th": 232.0381,
    "Pa": 231.03588, "U": 238.02891, "Np": 237.05, "Pu": 244.06,
    "Am": 243.06, "Cm": 247.07, "Bk": 247.07, "Cf": 251.08,
    "Es": 252.08, "Fm": 257.10, "Md": 258.10, "No": 259.10,
    "Lr": 262.11, "Rf": 261.11, "Db": 262.11, "Sg": 266.12,
    "Bh": 264.12, "Hs": 269.13, "Mt": 268.14,
}