# Information about the IUPAC alphabets

# One-letter codes of the 20 standard amino acids.
protein_letters = "ACDEFGHIKLMNPQRSTVWY"

# The standard letters followed by the ambiguity / rare-residue codes.
extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
#   B = "Asx";  aspartic acid or asparagine (D or N)
#   X = "Xaa";  unknown or 'other' amino acid (sometimes written "Xxx")
#   Z = "Glx";  glutamic acid or glutamine (E or Q)
#   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
#
#   J = "Xle";  leucine or isoleucine (L or I, used in NMR)
#   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#   Also the International Nucleotide Sequence Database Collaboration (INSDC)
#   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
#   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
#
#   Xle (J); Leucine or Isoleucine
#   The residue abbreviations, Xle (the three-letter abbreviation) and J
#   (the one-letter abbreviation) are reserved for the case that cannot
#   experimentally distinguish leucine from isoleucine.
#
#   U = "Sec";  selenocysteine
#   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#
#   O = "Pyl";  pyrrolysine
#   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35

# One-letter -> three-letter codes for the 20 standard amino acids.
protein_letters_1to3 = {
    'A': 'Ala', 'C': 'Cys', 'D': 'Asp',
    'E': 'Glu', 'F': 'Phe', 'G': 'Gly', 'H': 'His',
    'I': 'Ile', 'K': 'Lys', 'L': 'Leu', 'M': 'Met',
    'N': 'Asn', 'P': 'Pro', 'Q': 'Gln', 'R': 'Arg',
    'S': 'Ser', 'T': 'Thr', 'V': 'Val', 'W': 'Trp',
    'Y': 'Tyr',
}

# One-letter -> three-letter codes including the extended letters.
# Built from a copy so that protein_letters_1to3 itself is left untouched.
protein_letters_1to3_extended = dict(protein_letters_1to3)
protein_letters_1to3_extended.update({
    'B': 'Asx', 'X': 'Xaa', 'Z': 'Glx', 'J': 'Xle',
    # Selenocysteine is "Sec" (see the reference above), not "Sel".
    'U': 'Sec', 'O': 'Pyl',
})

# Reverse (three-letter -> one-letter) lookups derived from the tables above.
protein_letters_3to1 = dict(
    (three, one) for one, three in protein_letters_1to3.items())
protein_letters_3to1_extended = dict(
    (three, one) for one, three in protein_letters_1to3_extended.items())

ambiguous_dna_letters = "GATCRYWSMKHBVDN"
unambiguous_dna_letters = "GATC"
ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
unambiguous_rna_letters = "GAUC"

#   B == 5-bromouridine
#   D == 5,6-dihydrouridine
#   S == thiouridine
#   W == wyosine
extended_dna_letters = "GATCBDSW"

# are there extended forms?
#extended_rna_letters = "GAUCBDSW"

# Each (possibly ambiguous) DNA letter -> the unambiguous bases it may stand for.
ambiguous_dna_values = {
    "A": "A",
    "C": "C",
    "G": "G",
    "T": "T",
    "M": "AC",
    "R": "AG",
    "W": "AT",
    "S": "CG",
    "Y": "CT",
    "K": "GT",
    "V": "ACG",
    "H": "ACT",
    "D": "AGT",
    "B": "CGT",
    "X": "GATC",
    "N": "GATC",
}

# Each (possibly ambiguous) RNA letter -> the unambiguous bases it may stand for.
ambiguous_rna_values = {
    "A": "A",
    "C": "C",
    "G": "G",
    "U": "U",
    "M": "AC",
    "R": "AG",
    "W": "AU",
    "S": "CG",
    "Y": "CU",
    "K": "GU",
    "V": "ACG",
    "H": "ACU",
    "D": "AGU",
    "B": "CGU",
    "X": "GAUC",
    "N": "GAUC",
}

# Complement of every DNA letter, ambiguity codes included.
ambiguous_dna_complement = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}

# Complement of every RNA letter, ambiguity codes included.
ambiguous_rna_complement = {
    "A": "U",
    "C": "G",
    "G": "C",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}


def _make_ranges(mydict):
    """Return a new dict mapping each key of mydict to (value, value).

    This turns a table of single weights into the (minimum, maximum)
    layout used by the *_weight_ranges tables, where both ends coincide.
    """
    return dict((key, (value, value)) for key, value in mydict.items())


# Mass data taken from PubChem

# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {
    "A": 331.2218,
    "C": 307.1971,
    "G": 347.2212,
    "T": 322.2085
}

# Monoisotopic masses of monophosphate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
    "A": 331.06817,
    "C": 307.056936,
    "G": 347.063084,
    "T": 322.056602
}

unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

unambiguous_rna_weights = {
    "A": 347.2212,
    "C": 323.1965,
    "G": 363.2206,
    "U": 324.1813
}

monoisotopic_unambiguous_rna_weights = {
    "A": 347.063084,
    "C": 323.051851,
    "G": 363.057999,
    "U": 324.035867
}

unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


def _make_ambiguous_ranges(mydict, weight_table):
    """Build weight ranges and average weights for ambiguous letters.

    Arguments:
     - mydict - maps each letter to a string of the unambiguous letters
       it may represent (e.g. ``{"M": "AC"}``).
     - weight_table - maps unambiguous letters to their weights.

    Returns a tuple ``(range_d, avg_d)`` where ``range_d[letter]`` is
    ``(min_weight, max_weight)`` over the letters it may represent and
    ``avg_d[letter]`` is the arithmetic mean of those weights.

    A letter is left out of both results when it maps to an empty string
    or when any letter it may represent has no entry in weight_table, as
    no meaningful range or average can be computed for it.
    """
    range_d = {}
    avg_d = {}
    for letter, values in mydict.items():
        # Only letters whose constituents all have a known weight can be
        # resolved; anything else is skipped rather than raising.
        if not values or any(x not in weight_table for x in values):
            continue
        weights = [weight_table[x] for x in values]
        range_d[letter] = (min(weights), max(weights))
        # Accumulate left to right starting from 0.0; kept as an explicit
        # loop so the floating-point result does not depend on how the
        # built-in sum() adds floats.
        total_w = 0.0
        for w in weights:
            total_w += w
        avg_d[letter] = total_w / len(weights)
    return range_d, avg_d


ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = \
    _make_ambiguous_ranges(ambiguous_dna_values,
                           unambiguous_dna_weights)

ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = \
    _make_ambiguous_ranges(ambiguous_rna_values,
                           unambiguous_rna_weights)

# Average masses of the amino acids, keyed by one-letter code.
protein_weights = {
    "A": 89.0932,
    "C": 121.1582,
    "D": 133.1027,
    "E": 147.1293,
    "F": 165.1891,
    "G": 75.0666,
    "H": 155.1546,
    "I": 131.1729,
    "K": 146.1876,
    "L": 131.1729,
    "M": 149.2113,
    "N": 132.1179,
    "O": 255.3134,
    "P": 115.1305,
    "Q": 146.1445,
    "R": 174.201,
    "S": 105.0926,
    "T": 119.1192,
    "U": 168.0532,
    "V": 117.1463,
    "W": 204.2252,
    "Y": 181.1885
}

# Monoisotopic masses of the amino acids, keyed by one-letter code.
monoisotopic_protein_weights = {
    "A": 89.047678,
    "C": 121.019749,
    "D": 133.037508,
    "E": 147.053158,
    "F": 165.078979,
    "G": 75.032028,
    "H": 155.069477,
    "I": 131.094629,
    "K": 146.105528,
    "L": 131.094629,
    "M": 149.051049,
    "N": 132.053492,
    "O": 255.158292,
    "P": 115.063329,
    "Q": 146.069142,
    "R": 174.111676,
    "S": 105.042593,
    "T": 119.058243,
    "U": 168.964203,
    "V": 117.078979,
    "W": 204.089878,
    "Y": 181.073893,
}

# Each extended protein letter -> the amino acid letters it may stand for.
extended_protein_values = {
    "A": "A",
    "B": "ND",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "IL",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "O",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "U",
    "V": "V",
    "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    #TODO - Include U and O in the possible values of X?
    #This could alter the extended_protein_weight_ranges ...
    #by MP: Won't do this, because they are so rare.
    "Y": "Y",
    "Z": "QE",
}

protein_weight_ranges = _make_ranges(protein_weights)

extended_protein_weight_ranges, avg_extended_protein_weights = \
    _make_ambiguous_ranges(extended_protein_values,
                           protein_weights)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
# Atomic weights keyed by element symbol, listed in order of atomic
# number (H through Mt) and grouped below by periodic-table period.
atom_weights = {
    # Period 1
    'H': 1.00794, 'He': 4.002602,
    # Period 2
    'Li': 6.941, 'Be': 9.012182, 'B': 10.811, 'C': 12.0107,
    'N': 14.0067, 'O': 15.9994, 'F': 18.9984032, 'Ne': 20.1797,
    # Period 3
    'Na': 22.989770, 'Mg': 24.3050, 'Al': 26.981538, 'Si': 28.0855,
    'P': 30.973761, 'S': 32.065, 'Cl': 35.453, 'Ar': 39.948,
    # Period 4
    'K': 39.0983, 'Ca': 40.078, 'Sc': 44.955910, 'Ti': 47.867,
    'V': 50.9415, 'Cr': 51.9961, 'Mn': 54.938049, 'Fe': 55.845,
    'Co': 58.933200, 'Ni': 58.6934, 'Cu': 63.546, 'Zn': 65.39,
    'Ga': 69.723, 'Ge': 72.64, 'As': 74.92160, 'Se': 78.96,
    'Br': 79.904, 'Kr': 83.80,
    # Period 5
    'Rb': 85.4678, 'Sr': 87.62, 'Y': 88.90585, 'Zr': 91.224,
    'Nb': 92.90638, 'Mo': 95.94, 'Tc': 98.0, 'Ru': 101.07,
    'Rh': 102.90550, 'Pd': 106.42, 'Ag': 107.8682, 'Cd': 112.411,
    'In': 114.818, 'Sn': 118.710, 'Sb': 121.760, 'Te': 127.60,
    'I': 126.90447, 'Xe': 131.293,
    # Period 6 (lanthanides included)
    'Cs': 132.90545, 'Ba': 137.327, 'La': 138.9055, 'Ce': 140.116,
    'Pr': 140.90765, 'Nd': 144.24, 'Pm': 145.0, 'Sm': 150.36,
    'Eu': 151.964, 'Gd': 157.25, 'Tb': 158.92534, 'Dy': 162.50,
    'Ho': 164.93032, 'Er': 167.259, 'Tm': 168.93421, 'Yb': 173.04,
    'Lu': 174.967, 'Hf': 178.49, 'Ta': 180.9479, 'W': 183.84,
    'Re': 186.207, 'Os': 190.23, 'Ir': 192.217, 'Pt': 195.078,
    'Au': 196.96655, 'Hg': 200.59, 'Tl': 204.3833, 'Pb': 207.2,
    'Bi': 208.98038, 'Po': 208.98, 'At': 209.99, 'Rn': 222.02,
    # Period 7 (actinides included)
    'Fr': 223.02, 'Ra': 226.03, 'Ac': 227.03, 'Th': 232.0381,
    'Pa': 231.03588, 'U': 238.02891, 'Np': 237.05, 'Pu': 244.06,
    'Am': 243.06, 'Cm': 247.07, 'Bk': 247.07, 'Cf': 251.08,
    'Es': 252.08, 'Fm': 257.10, 'Md': 258.10, 'No': 259.10,
    'Lr': 262.11, 'Rf': 261.11, 'Db': 262.11, 'Sg': 266.12,
    'Bh': 264.12, 'Hs': 269.13, 'Mt': 268.14,
}