#!/usr/bin/env python # Graham's protein statistic generator # ------------------------------------ # SYNOPSIS # Calculates the following statistics from FASTA protein sequences # # 1. the molecular weight of each FASTA sequence. # 2. the theoretical pI value of the FASTA sequence. # 3. the composition of each amino acid. # 4. the total number of amino acids in each protein # # USAGE # gstat.py # # Examples: # gstat.py ecoli-k12.fsa # gstat.py -kda ecoli-k12.fsa > data_save.csv # gstat.py -img ecoli-k12.fsa ecoli-plasmid1.fsa # gstat.py -kda -img ecoli-k12.fsa ecoli-plasmid1.fsa > data_save.csv # # INPUT FILES # This program ASSUMES that the files are already translated to protein. # This program does NOT translate DNA sequences to protein sequences at all. # # PREREQUISITES # This program requires the following modules (version numbers in parenthesis # are the version numbers I used, NOT the minimum or maximum version numbers): # # Python (2.6.1) # # OUTPUT # The output of the program is CSV (tab delimited) via stdout. # The columns outputted are as follows: # # NAME Mol. Wt. pI COMPOSITION SEQUENCE # # IF you are using multiple files, this program will print all # of the sequences in succession to standard output. # ################################### # THIS IS OPEN SOURCE CODE (YAY!) # ################################### # LICENSE # This code is licensed under the Creative Commons 3.0 # Attribution + ShareAlike license - for details see: # http://creativecommons.org/licenses/by-sa/3.0/ # # NOTE: this work, itself, is in part derrived from other # works. Please see the ACKNOWLEDGEMENTS/CO-AUTHORS # section for additional attributions. # # AUTHOR # Graham Alvare - home.cc.umanitoba.ca/~alvare # # ACKNOWLEDGEMENTS/CO-AUTHORS # Dr. Brian Fristensky - my work supervisor, and the man who introduced # me to the wonderful field of bioinformatics. # Lukasz Kozlowski - bisection Henderson-Hasselbach algorithm # for determining the theoretical pI of proteins # Kozlowski L. 2007-2012 Isoelectric Point Calculator. # http://isoelectric.ovh.org # Dr. Abby Perrill - amino acid pKa table # http://www.cem.msu.edu/~cem252/sp97/ch24/ch24aa.html # # QUESTIONS & COMMENTS # If you have any questions, please contact me: alvare@cc.umanitoba.ca # I usually get back to people within 1-2 weekdays (weekends, I am slower) # # P.S. please also let me know of any bugs, or if you have any suggestions # I am generally happy to help create new tools, or modify my existing # tools to make them more useful. # # Happy usage! # import re import sys # a list of the basic molecular weights of the basic amino acids # determined by periodic table and AA composition aaweights = { 'A' : 71.09, # alanine 'R' : 156.19, # arginine 'D' : 114.11, # aspartic acid 'N' : 115.09, # asparagine 'C' : 103.15, # cysteine 'E' : 129.12, # glutamic acid 'Q' : 128.14, # glutamine 'G' : 57.05, # glycine 'H' : 137.14, # histidine 'I' : 113.16, # isoleucine 'L' : 113.16, # leucine 'K' : 128.17, # lysine 'M' : 131.19, # methionine 'F' : 147.18, # phenylalanine 'P' : 97.12, # proline 'S' : 87.08, # serine 'T' : 101.11, # threonine 'W' : 186.12, # tryptophan 'Y' : 163.18, # tyrosine 'V' : 99.14 # valine } # a list of pKa's for amino acids # references from: http://www.cem.msu.edu/~cem252/sp97/ch24/ch24aa.html # on Tuesday October 2, 2012 # pka's are listed as such [c-terminus, n-terminus, side chain] aa_pkas = { 'A' : [2.35, 9.87 ], # alanine 'R' : [2.01, 9.04, 12.48], # arginine 'D' : [2.10, 9.82, 3.86], # aspartic acid 'N' : [2.02, 8.80 ], # asparagine 'C' : [2.05, 10.25, 8.00], # cysteine 'E' : [2.10, 9.47, 4.07], # glutamic acid 'Q' : [2.17, 9.13 ], # glutamine 'G' : [2.35, 9.78 ], # glycine 'H' : [1.77, 9.18, 6.10], # histidine 'I' : [2.32, 9.76 ], # isoleucine 'L' : [2.33, 9.74 ], # leucine 'K' : [2.18, 8.95, 10.53], # lysine 'M' : [2.28, 9.21 ], # methionine 'F' : [2.58, 9.24 ], # phenylalanine 'P' : [2.00, 10.60 ], # proline 'S' : [2.21, 9.15 ], # serine 'T' : [2.09, 9.10 ], # threonine 'W' : [2.38, 9.39 ], # tryptophan 'Y' : [2.20, 9.11, 10.07], # tyrosine 'V' : [2.29, 9.72 ] # valine } # the number of C, H, O, and N atoms in each amino acid # determined from AA structure aa_comp = { 'A' : [3, 5, 1, 1], # alanine - OK 'R' : [6, 12, 1, 4], # arginine 'D' : [4, 4, 3, 1], # aspartic acid - OK 'N' : [4, 7, 2, 2], # asparagine 'C' : [3, 5, 1, 1], # cysteine - OK 'E' : [5, 7, 3, 1], # glutamic acid 'Q' : [5, 8, 2, 2], # glutamine 'G' : [2, 3, 1, 1], # glycine 'H' : [6, 7, 1, 3], # histidine 'I' : [6, 11, 1, 1], # isoleucine 'L' : [6, 11, 1, 1], # leucine 'K' : [6, 12, 1, 2], # lysine 'M' : [5, 9, 1, 1], # methionine - OK 'F' : [9, 9, 1, 1], # phenylalanine 'P' : [5, 7, 1, 1], # proline ** 'S' : [3, 5, 2, 1], # serine 'T' : [4, 7, 2, 1], # threonine - OK 'W' : [11, 10, 1, 2], # tryptophan ** 'Y' : [9, 9, 2, 1], # tyrosine 'V' : [5, 9, 1, 1] # valine } ######## # Theoretical pI calculator ######## # This method esimates the theoretical pI of a protein. # # The algorithm used in this method is a modified version of: # http://isoelectric.ovh.org/files/isoelectric-point-theory.html # # If you use this algorithm in subsequent works, please cite: # Kozlowski L. 2007-2012 Isoelectric Point Calculator. http://isoelectric.ovh.org # # If you use this code, please include me in addition to Kozlowski: # Kozlowski L. 2007-2012 Isoelectric Point Calculator. http://isoelectric.ovh.org # Graham Alvare - Python translation of the Isoelectric Point Calculator algoirthm def calculate_pi(first_aa, last_aa, numarg, numasp, numglu, numcys, numtyr, numhis, numlys, THRESHOLD = 0.01): first = aa_pkas [ first_aa ][0] last = aa_pkas [ last_aa ][1] pH = 6.5 pHprev = 0.0 pHnext = 14.0 # bisection algorithm from: http://isoelectric.ovh.org/files/isoelectric-point-theory.html # C-terminal charge + D charge + E charge + C charge + Y charge + H charge # + NH2charge + K charge + R charge while not ((pH - pHprev < THRESHOLD) and (pHnext - pH < THRESHOLD)): charge =-1/(1+pow(10,(first-pH))) \ - numasp/(1+pow(10,(3.9-pH))) \ - numglu/(1+pow(10,(4.07-pH))) \ - numcys/(1+pow(10,(8.18-pH))) \ - numtyr/(1+pow(10,(10.46-pH))) \ + numhis/(1+pow(10,(pH-6.04))) \ + 1/(1+pow(10,(last-8.2))) \ + numlys/(1+pow(10,(pH-10.54))) \ + numarg/(1+pow(10,(pH-12.48))) if charge < 0: # the charge is negative, we need to decrease the pH pHnext = pH pH = pH - ((pH - pHprev) / 2) else: # the charge is positive, we need to increase the pH pHprev = pH pH = pH + ((pHnext - pH)/2) return pH ######## # Protein statistic output ######## # This method prints all of the statistics for a given protein def printstats(name, seq, kDa = False, aaCount = False, TSV = True): tag = "" weight = 0 numala = 0 numarg = 0 numasp = 0 numasn = 0 numcys = 0 numglu = 0 numgln = 0 numgly = 0 numhis = 0 numile = 0 numleu = 0 numlys = 0 nummet = 0 numphe = 0 numpro = 0 numser = 0 numthr = 0 numtrp = 0 numtyr = 0 numval = 0 first_aa = "" last_aa = "" numc = 0 numh = 0 numo = 0 numn = 0 # counts B, Z, and X numasx = 0 # B numunk = 0 # X numglx = 0 # Z numaa = 0 # loop through every amino acid in the protein sequence for aa in seq: if aa == "B": numasx += 1 numaa += 1 elif aa == "X": numunk += 1 numaa += 1 elif aa == "Z": numglx += 1 numaa += 1 elif aa in aaweights: # calculate the molecular weight weight += aaweights[aa] # calculate the instance of each charged amino acid (and charge) if first_aa == "": first_aa = aa if aa == "A": numala += 1 elif aa == "R": numarg += 1 elif aa == "D": numasp += 1 elif aa == "N": numasn += 1 elif aa == "C": numcys += 1 elif aa == "E": numglu += 1 elif aa == "Q": numgln += 1 elif aa == "G": numgly += 1 elif aa == "H": numhis += 1 elif aa == "I": numile += 1 elif aa == "L": numleu += 1 elif aa == "K": numlys += 1 elif aa == "M": nummet += 1 elif aa == "F": numphe += 1 elif aa == "P": numpro += 1 elif aa == "S": numser += 1 elif aa == "T": numthr += 1 elif aa == "W": numtrp += 1 elif aa == "Y": numtyr += 1 elif aa == "V": numval += 1 numc += aa_comp[aa][0] numh += aa_comp[aa][1] numo += aa_comp[aa][2] numn += aa_comp[aa][3] last_aa = aa numaa += 1 formula = "C" + str(numc) + "H" + str(numh + 2) + "O" + str(numo + 1) + "N" + str(numn) if nummet + numcys > 0: formula += "S" + str(nummet + numcys) if kDa: weight = str(round((weight / 1000), 2)) + " kDa" else: weight = str(round((weight), 2)) + " Da" pI = round(calculate_pi(first_aa, last_aa, numarg, numasp, numglu, numcys, numtyr, numhis, numlys), 2) if TSV: if aaCount: print name + " " + str(numaa) + " " + weight + " " + str(pI) + " " + formula + " " \ + str(numala) + " " + str(numasx) + " " + str(numcys) + " " \ + str(numasp) + " " + str(numglu) + " " + str(numphe) + " " \ + str(numgly) + " " + str(numhis) + " " + str(numile) + " " \ + str(numlys) + " " + str(numleu) + " " + str(nummet) + " " \ + str(numasn) + " " + str(numpro) + " " + str(numgln) + " " \ + str(numarg) + " " + str(numser) + " " + str(numthr) + " " \ + str(numval) + " " + str(numtrp) + " " + str(numunk) + " " \ + str(numtyr) + " " + str(numglx) + " " + seq else: print name + " " + str(numaa) + " " + weight + " " + str(pI) + " " + formula + " " + seq else: print "Name: " + name print "Molecular Weight: " + weight print "Theoretical pI: " + str(pI) print "Molecular Formula: " + formula if aaCount: # required, so division will work properly numaa = float(numaa) numnonpol = numgly + numala + numval + numleu + numile + nummet + numphe + numpro numpolar = numser + numthr + numcys + numasn + numgln + numtyr + numtrp numcharge = numasp + numglu + numhis + numlys + numarg numplus = numhis + numlys + numarg numminus = numasp + numglu numother = numasx + numglx + numunk # print the relevant protein statistics print """Number of amino acids: """ + str(numaa) + """ Nonpolar side chains: """ + str(numnonpol) + " (" + str(round(numnonpol / numaa, 3)) + """) Gly(G) """ + str(numgly) + " (" + str(round(numgly / numaa, 3)) + """) Ala(A) """ + str(numala) + " (" + str(round(numala / numaa, 3)) + """) Val(V) """ + str(numval) + " (" + str(round(numval / numaa, 3)) + """) Leu(L) """ + str(numleu) + " (" + str(round(numleu / numaa, 3)) + """) Ile(I) """ + str(numile) + " (" + str(round(numile / numaa, 3)) + """) Met(M) """ + str(nummet) + " (" + str(round(nummet / numaa, 3)) + """) Phe(F) """ + str(numphe) + " (" + str(round(numphe / numaa, 3)) + """) Pro(P) """ + str(numpro) + " (" + str(round(numpro / numaa, 3)) + """) Neutral polar side chains: """ + str(numpolar) + " (" + str(round(numpolar / numaa, 3)) + """) Ser(S) """ + str(numser) + " (" + str(round(numser / numaa, 3)) + """) Thr(T) """ + str(numthr) + " (" + str(round(numthr / numaa, 3)) + """) Cys(C) """ + str(numcys) + " (" + str(round(numcys / numaa, 3)) + """) Asn(N) """ + str(numasn) + " (" + str(round(numasn / numaa, 3)) + """) Gln(Q) """ + str(numgln) + " (" + str(round(numgln / numaa, 3)) + """) Tyr(Y) """ + str(numtyr) + " (" + str(round(numtyr / numaa, 3)) + """) Trp(W) """ + str(numtrp) + " (" + str(round(numtrp / numaa, 3)) + """) Charged polar side chains: """ + str(numcharge) + " (" + str(round(numcharge / numaa, 3)) + """) Positive: """ + str(numplus) + " (" + str(round(numplus / numaa, 3)) + """) His(H) """ + str(numhis) + " (" + str(round(numhis / numaa, 3)) + """) Lys(K) """ + str(numlys) + " (" + str(round(numlys / numaa, 3)) + """) Arg(R) """ + str(numarg) + " (" + str(round(numarg / numaa, 3)) + """) Negative: """ + str(numminus) + " (" + str(round(numminus / numaa, 3)) + """) Asp(D) """ + str(numasp) + " (" + str(round(numasp / numaa, 3)) + """) Glu(E) """ + str(numglu) + " (" + str(round(numglu / numaa, 3)) + """) Other: """ + str(numother) + " (" + str(round(numother / numaa, 3)) + """) Asx(B) """ + str(numasx) + " (" + str(round(numasx / numaa, 3)) + """) Glx(Z) """ + str(numglx) + " (" + str(round(numglx / numaa, 3)) + """) Unk(X) """ + str(numunk) + " (" + str(round(numunk / numaa, 3)) + """) """ print "Sequence: " + seq print print ######## # Main method body ######## if __name__ == "__main__": # stores the start position of the file portion of the command line arguments start = 1 # stores whether you are using IMG/ER FASTA sequences IMG = False # stores whether you want the molecular weights # displayed in daltons (Da) or kilodaltons (kDa) kDa = False # stores whether an amino acid count should be # performed for each sequence. aaCount = False # whether to present the output in a TSV format TSV = False # check the presence of command line arguments while len(sys.argv) > start: if sys.argv[start].lower() in ["-i", "-img"]: IMG = True elif sys.argv[start].lower() in ["-k", "-kda"]: kDa = True elif sys.argv[start].lower() in ["-aacount"]: aaCount = True elif sys.argv[start].lower() in ["-tsv"]: TSV = True else: break start += 1 # ensure that the command was executed with proper arguments if len(sys.argv) < 2: print """Graham's protein statistic generator ------------------------------------ calculates the following statistics from FASTA protein sequences 1. the molecular weight of each FASTA sequence 2. the theoretical pI value of the FASTA sequence using the alogirthm of: Kozlowski L. 2007-2012 Isoelectric Point Calculator. http://isoelectric.ovh.org using pKa tables from: 3. the atomic composition (CHONS) of the protein 4. the total number of amino acids in each protein Usage: gmstat.py [ options ] Examples: gstat.py ecoli-k12.fsa gstat.py -kda ecoli-k12.fsa > data_save.csv gstat.py -img ecoli-k12.fsa ecoli-plasmid1.fsa gstat.py -kda -img ecoli-k12.fsa ecoli-plasmid1.fsa > data_save.csv Options: -kda --- display molecular weights in kDa -img --- display locus tags from IMG/ER FASTA files -aacount --- perform an amino acid count for each sequence -tsv --- presents the output in a TSV format """ sys.exit(1) # if we are presenting the output in a TSV format # we must print a header for the TSV output if TSV: if aaCount: if IMG: # print the header (IMG/ER) print "NAME TAG NUMAA Mol. Wt. pI COMPOSITION " \ + "A B C D E F G H " \ + "I K L M N P Q R " \ + "S T V W X Y Z SEQUENCE" else: # print the regular header print "NAME NUMAA Mol. Wt. pI COMPOSITION " \ + "A B C D E F G H " \ + "I K L M N P Q R " \ + "S T V W X Y Z SEQUENCE" else: if IMG: # print the header (IMG/ER) print "NAME TAG NUMAA Mol. Wt. pI COMPOSITION SEQUENCE" else: # print the regular header print "NAME NUMAA Mol. Wt. pI COMPOSITION SEQUENCE" name = "" sequence = "" for filename in sys.argv[start:]: # open the current FASTA file FILE_HANDLE = open(filename, 'rU') # iterate through the lines in the current FASTA file for line in FILE_HANDLE: # obtain the name of the current sequence in the file if line.startswith(">"): # if there is already a sequence in the queue, # print the sequence's statistics before going # onto the next sequence if name != "": printstats(name, sequence, kDa, aaCount, TSV) # clear the queued sequence, and read in the name # of the sequence contained on the current line name = line[1:].strip('\n\r') if IMG: # here is where you can also do any modifications to the name field # such as extracting the locus tags from an IMG/ER FASTA file # I have done just this below (because at the time of this writing # I am using IMG/ER FASTA files); however, I have made this conditional # code because not everyone using this will be using IMG/ER img = name.split(" ")[0] tag = name.split(" ")[1] name = str.join(" ", name.split(" ")[2:]) + " " + tag sequence = "" else: # add the sequence to the current sequence queue # be sure to delete any whitespace, and convert # the nucleotides to upper case sequence += re.sub(r'[\s\n\r]', '', line).upper() # print the stats for the last protein in the FASTA file. printstats(name, sequence, kDa, aaCount, TSV) # close the current FASTA file FILE_HANDLE.close()