#!/usr/bin/env python import os import os.path import random import shutil import subprocess import sys # To change this template, choose Tools | Templates # and open the template in the editor. __author__="Graham Alvare, Brian Fristensky" __date__ ="Sun Mar 18 14:57:59 CDT 2012" if __name__ == "__main__": print "PHYLIP Library module" #---------------------------------------------------------------------------- def merge_msg(outfile_path, consense = True, extra = False): ''' Merge messages from MSGFILE to the outfile generated by a Phylip program. If extra is included as a parameter, this appends a message indicating that the output is a consensus tree, whose branch lengths are the bootstrap values, NOT the actual branch lengths. ''' h_OUTFILE = open(outfile_path, "w") # First we write the contents of MSGFILE the OUTFILE. h_MSGFILE = open("MSGFILE", "r") h_OUTFILE.writelines(h_MSGFILE.readlines()) h_MSGFILE.close() # Next, we write the contents of 'outfile' to the OUTFILE h_smalloutfile = open("outfile", "r") h_OUTFILE.writelines(h_smalloutfile.readlines()) h_smalloutfile.close() # For consensus trees, add an extra message at the end if consense: h_outfile_consense = open("outfile.consense", "r") h_OUTFILE.writelines(h_outfile_consense.readlines()) h_outfile_consense.close() if extra: h_OUTFILE.write("" + '\n') h_OUTFILE.write("" + '\n') h_OUTFILE.write(">>>> THIS TREEFILE IS A CONSENSUS TREE, WHOSE BRANCH LENGTHS" + '\n') h_OUTFILE.write(">>>> ARE BOOTSTRAP VALUES, NOT ACTUAL BRANCH LENGTHS." + '\n') h_OUTFILE.write(">>>> TO GENERATE BRANCH LENGTHS" + '\n') h_OUTFILE.write(">>>> USE TREE FILE AS INPUT FOR DNAML OR OTHER PROGRAM" + '\n') h_OUTFILE.write(">>>> USING THE USERTREE OPTION" + '\n') h_OUTFILE.close() #---------------------------------------------------------------------------- def get_numseq(infile): '''Read first line of a Phylip file to find out how many sequences there are''' h_infile = open(infile, "r") NUMSEQ = h_infile.readline().split()[0] h_infile.close() return NUMSEQ #---------------------------------------------------------------------------- def do_outgroup(OUTGROUP, NUMSEQ, comfile, h_msgfile): '''Make sure OUTGROUP is not greater than NUMSEQ''' tempoutgroup = int(OUTGROUP) if tempoutgroup > 1 and tempoutgroup <= NUMSEQ: comfile.write("o" + '\n') comfile.write(OUTGROUP + '\n') else: tempoutgroup = 1 h_msgfile.write("OUTGROUP = " + str(tempoutgroup) + '\n') return tempoutgroup #---------------------------------------------------------------------------- def phylip_random(): '''Generate a random integer as needed by Phylip programs. These numbers are used by Phylip programs as seeds for a random number stream. They must be odd, in the form 4n + 1. Return value is an integer between 0 and 2e16 -1 (which is 65535). Although the Phylip Main document claims that 32-bit random numbers are acceptible, at least one program, PARS, will only take 16-bit random numbers''' #pseed = random.randint(0,4294967295) # 32-bit random number pseed = random.randint(0,65535) prand = ( ( ( int(pseed) / 4 ) * 4 ) + 1 ) return prand #---------------------------------------------------------------------------- def seqboot (INFILE, DATATYPE, RSEED, METHOD, REPLICATES, PERCENT, BLOCKSIZE, OUTWEIGHTS, OUTFORMAT, OUTFILE): '''Run Seqboot. Notes: SEQBOOT reads interleaved sequences by default, but can read sequential files using the "I" setting. By default, SEQBOOT writes datasets to outfile, but will write weights to 'outweights' if you set the "S" option. Weight files are always sequential. ''' # Make a temporary directory to run the program in STARTDIR = os.getcwd() TEMPDIR = 'SEQBOOT.' + str(os.getpid()) os.mkdir(TEMPDIR) shutil.copy(INFILE, os.path.join(TEMPDIR, 'infile')) os.chdir(TEMPDIR) #----------------- generate keyboard input to send to program ----- # Generate keyboard input for protdist and save it to SeqbootComfile. # While it is possible to pipe the output directly to seqboot, we have # discovered that on systems with a high load average, the program doesn't # always complete. Besides, it is WAY easier to debug when we save keyboard # output to a file that can be examined. comfile_h = open('SeqbootComfile', 'w') # Percent of characters to sample if PERCENT < 5 or PERCENT > 100: PERCENT = 100 comfile_h.write('%\n') comfile_h.write(str(PERCENT) + '\n') # Block size for resampling if BLOCKSIZE < 1 or BLOCKSIZE > 100: BLOCKSIZE = 1 if BLOCKSIZE > 1: comfile_h.write('B\n') comfile_h.write(str(BLOCKSIZE) + '\n') # Choose datatype type if DATATYPE == "m": # Discrete morphology or molecular markers comfile_h.write('d\n') elif DATATYPE == "r": # Restriction sites comfile_h.write('d\n') comfile_h.write('d\n') #comfile_h.write('e\n') # num. of enzymes is on 1st line of infile elif DATATYPE == "R": # Restriction sites, no enzyme number in infile comfile_h.write('d\n') comfile_h.write('d\n') elif DATATYPE == "g": # Allele frequencies comfile_h.write('d\n') comfile_h.write('d\n') comfile_h.write('d\n') elif DATATYPE == "s": # Molecular sequences pass # Choose resampling method # Only bootstrapping and jacknifing can generate weights. # Others must generate complete datasets if METHOD == "b": # bootstrap if OUTWEIGHTS == "yes": comfile_h.write('s\n') elif METHOD == "d": # delete-half jacknife comfile_h.write('j\n') if OUTWEIGHTS == "yes": comfile_h.write('s\n') elif METHOD == "ps": # permute species for each character comfile_h.write('j\n') comfile_h.write('j\n') OUTWEIGHTS = "no" elif METHOD == "po": # permute character order comfile_h.write('j\n') comfile_h.write('j\n') comfile_h.write('j\n') OUTWEIGHTS = "no" elif METHOD == "pw": # permute within species comfile_h.write('j\n') comfile_h.write('j\n') comfile_h.write('j\n') comfile_h.write('j\n') OUTWEIGHTS = "no" elif METHOD == "rew": # rewrite data comfile_h.write('j\n') comfile_h.write('j\n') comfile_h.write('j\n') comfile_h.write('j\n') comfile_h.write('j\n') OUTWEIGHTS = "no" else: OUTWEIGHTS = "no" # Number of replicates comfile_h.write('r\n') comfile_h.write(str(REPLICATES) + '\n') # Output format: interleaved (default), sequential if OUTFORMAT == 's': comfile_h.write('i\n') #accept current settings and do the analysis comfile_h.write('y\n') # Random seed, odd, of the form 4n + 1 tempseed = ( ( ( int(RSEED) / 4 ) * 4 ) + 1 ) #tempseed = phylip_random() comfile_h.write(str(tempseed) + '\n') comfile_h.close() #-------- Run seqboot, sending terminal output to /dev/null ----------- comfile_h = open('SeqbootComfile', 'r') p = subprocess.Popen(['seqboot'], stdin=comfile_h, stdout=subprocess.PIPE) p.wait() comfile_h.close() if OUTWEIGHTS == "yes": # Debug statement. #p_testinfile = subprocess.call(["nedit", "outweights"]) shutil.move('outweights', os.path.join(STARTDIR, OUTFILE)) else: # Debug statement. #p_testinfile = subprocess.call(["nedit", "outfile"]) shutil.move('outfile', os.path.join(STARTDIR, OUTFILE)) os.chdir(STARTDIR) shutil.rmtree(TEMPDIR, True) #---------------------------------------------------------------------------- def jumble(comfile, h_msgfile, NUMJUM): '''Jumble - When multiple datasets are analyzed, protpars automatically jumbles, and prompts for a random number seed for jumbling. Otherwise, jumbling must be explicitly set.''' # Random seeds, odd, of the form 4n + 1 tempjseed = str(phylip_random()) h_msgfile.write("JUMBLING SEQUENCE ORDER " + str(NUMJUM) + " ITERATIONS, SEED=" + tempjseed + '\n') comfile.write('j' + '\n') comfile.write(tempjseed + '\n') comfile.write(str(NUMJUM) + '\n') #---------------------------------------------------------------------------- def stdresample(METHOD, PERCENT, REPLICATES, BLOCKSIZE, h_msgfile, DATATYPE, OUTFORMAT): '''Run SEQBOOT to generate resampled datasets. Output is weights, used by Phylip programs to generate datasets on the fly. If you want SEQBOOT to generate actual datasets instead of weights, use weightless_resample instead.''' Bseed = phylip_random() BseedStr = str(Bseed) RepStr = str(REPLICATES) # Choose resampling methodaz1 if METHOD == "n": h_msgfile.write(" " + '\n') #shutil.copyfile("infile.temp", "infile") elif METHOD in ("b", "d"): if METHOD == "b": #differences from METHOD b h_msgfile.write("RESAMPLING: Bootstrap, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') if int(BLOCKSIZE) > 1: h_msgfile.write('Resampling in blocks of ' + BLOCKSIZE + '\n') else: #differences from METHOD d h_msgfile.write("RESAMPLING: Delete-half Jacknifing, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') # shared code if PERCENT < 100: h_msgfile.write('Partial Resampling: ' + PERCENT + 'percent of sites sampled' + '\n') shutil.copyfile("infile.temp", "infile") seqboot('infile.temp', DATATYPE, Bseed, METHOD, REPLICATES, PERCENT, BLOCKSIZE, 'yes', OUTFORMAT, 'weights') elif METHOD in ("ps", "po", "pw"): # print the appropriate header in the MSGFILE if METHOD == "ps": h_msgfile.write("RESAMPLING: Permute species for each character, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') elif METHOD == "po": h_msgfile.write("RESAMPLING: Permute character order, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') else: # pw h_msgfile.write("RESAMPLING: Permute within species, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') # shared code seqboot('infile.temp', DATATYPE, Bseed, METHOD, REPLICATES, PERCENT, BLOCKSIZE, 'no', OUTFORMAT, 'infile') #---------------------------------------------------------------------------- def weightless_resample(METHOD, PERCENT, REPLICATES, BLOCKSIZE, msgfile_h, DATATYPE, OUTFORMAT): ''' Generate terminal input for running seqboot to generate randomized sequence files, rather than weight files and run seqboot''' Bseed = phylip_random() BseedStr = str(Bseed) RepStr = str(REPLICATES) # Choose resampling method if METHOD == "n": msgfile_h.write(" " + '\n') #shutil.copyfile("infile.temp", "infile") elif METHOD in ("b", "d", "ps", "po", "pw"): if METHOD == "ps": msgfile_h.write("RESAMPLING: Permute species for each character, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') elif METHOD == "po": msgfile_h.write("RESAMPLING: Permute character order, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') elif METHOD == "pw": msgfile_h.write("RESAMPLING: Permute within species, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') else: if METHOD == "b": msgfile_h.write("RESAMPLING: Bootstrap, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') else: msgfile_h.write("RESAMPLING: Delete-half Jacknifing, " + RepStr + " REPLICATES, SEED=" + BseedStr + '\n') # shared code if PERCENT < 100: msgfile_h.write("Partial Resampling: " + PERCENT + "percent of sites sampled" + '\n') # shared code seqboot('infile.temp', DATATYPE, Bseed, METHOD, REPLICATES, PERCENT, BLOCKSIZE, 'no', OUTFORMAT, 'infile') #---------------------------------------------------------------------------- def ufn(UFN, TEMPDIR, infile = True): '''Used by dnaml.py and protml.py. Make sure that treefile begins with number of trees on first line of file. If first line in file has parentheses, the number must be added.''' if infile: infile_h = open(os.path.join(TEMPDIR, 'intree'), 'a') else: infile_h = sys.stdout ufn_h = open (UFN, 'r') ufn_lines = ufn_h.readlines() ufn_h.close() sc_count = 0 if ufn_lines[0].find('(') > -1: for line in ufn_lines: if line.find(';') > -1: sc_count = sc_count + 1 infile_h.write(str(sc_count) + "\n") for line in ufn_lines: infile_h.write(line) infile_h.close()