#!/usr/bin/env python import re import os import os.path import phylip import shutil import subprocess import sys import time #Version November 17, 2014 # Run restml as a command #Synopsis: restml.py infile utree_ufn method replicates percent\ # allsites speedy global sitenum sitelen outgroup jumble numjum\ # termout printdata outfile treefile #Convert arguments to variables INFILE = sys.argv[1] #print 'INFILE: ' + INFILE # bltree: If no file has been chosen for UFN, UFN will be blank. # Simply substituting $UFN into the command for restml.csh would # therefore result in a missing parameter. Here we concatenate # UTREE and UFN in the form UTREE_UFN. restml.csh has been # modified to read both parameters from the UTREE variable, # and UFN is no longer passed as a separate variable. UTREEPARAM = sys.argv[2] UTREE = UTREEPARAM[0] UFN = UTREEPARAM[1:] METHOD = sys.argv[3] REPLICATES = sys.argv[4] PERCENT = sys.argv[5] ALLSITES = sys.argv[6] SPEEDY = sys.argv[7] GLOBAL = sys.argv[8] SITENUM = sys.argv[9] SITELEN = sys.argv[10] OUTGROUP = sys.argv[11] JUMBLE = sys.argv[12] NUMJUM = sys.argv[13] TERMOUT = sys.argv[14] PRINTDATA = sys.argv[15] OUTFILE = sys.argv[16] TREEFILE = sys.argv[17] #print 'UTREE: ' + UTREE #print 'UFN: ' + UFN #print 'METHOD: ' + METHOD # Make a temporary directory in which to run the program STARTDIR = os.getcwd() TEMPDIR = 'RESTML.' + str(os.getpid()) os.mkdir(TEMPDIR) shutil.copyfile(INFILE, os.path.join(TEMPDIR, 'infile.temp')) if UTREE == 'y': # Turn off bootstrapping when evaluating a user tree. # If bootstrapping is on, an empty file may be generated # which could cause drawtree or drawgram to loop infinitely METHOD = 'n' # Make sure that treefile begins with number of trees on first # line of file. If first line in file has parentheses, the # number must be added. phylip.ufn(UFN, TEMPDIR) os.chdir(TEMPDIR) # Debug statement. #p_testinfile = subprocess.call(["nedit", "infile.temp"]) #----------------- generate keyboard input to send to program ----- msgfile_h = open("MSGFILE", "w") #----------------- generated resampled datasets, if specified ----- # Choose resampling method # NOTE!!: Unlike most other programs in PHYLIP, RESTML can NOT use weights # when analyzing multiple datasets. Therefore, seqboot.csh must generate bootstrapped # datasets, rather than weights. # Also note that the 'r' option to seqboot.csh tells it that the datafile # contains an extra number, telling the number of restriction enzymes used. if METHOD == 'n': shutil.move('infile.temp', 'infile') else: phylip.weightless_resample(METHOD, PERCENT, REPLICATES, '1', msgfile_h, 'r', 'i') # RESTML expects a third number to appear on the first line of infile, # indicating the number of restriction enzymes used. This has to be added # to the first line of EACH dataset, when there are multiple datasets in # infile. shutil.move('infile', 'infile.bak') h_infile_bak = open('infile.bak', 'r') h_infile = open('infile', 'w') for line in h_infile_bak: rawline = line.strip() exp1 = '( *[0-9][0-9]* *[0-9][0-9]*)' #exp2 = '\g<1>' if re.match(exp1,rawline) : h_infile.write(rawline + ' ' + SITENUM + '\n') else: h_infile.write(rawline + '\n') h_infile_bak.close() h_infile.close() # Debug statement. #p_testinfile = subprocess.call(["nedit", "infile"]) # This statement reads from infile, so we have to run it before # running TREEPROGRAM, which also reads infile. NUMSEQ = phylip.get_numseq("infile") #----------------- Run RESTML ----- #----------------- generate keyboard input to send to restml program ----- comfile_h = open('RestMLComfile', 'w') # !!!!!!!!! Workaround for bug in restml3.69 !!!!!!!!!!!!! # # This section should be deleted, and the corresponding section # for SITELEN below, uncommented, once the bug is fixed. # # Problem: Site length - Broken in restml 3.69. Attempting to set this # parameter has no effect. # # Workaround: Joe Felsenstein says that the 'l' option does work # if the usertree option has been selected. So we set the # usertree option, set sitelen, and then unset usertree. # If usertree needs to be set, that will happen normally # as below. if int(SITELEN) < 1 or int(SITELEN) > 50: SITELEN = 6 comfile_h.write('u\n') comfile_h.write('l\n') comfile_h.write(str(SITELEN) + '\n') comfile_h.write('u\n') #msgfile_h.write( 'SITELENGTH = ' + str(SITELEN)) # !!!!!!!!!!!! end workaround if UTREE == 'y': comfile_h.write('u\n') msgfile_h.write( "--------------------- RESTML ---------------------") msgfile_h.write( "") # RESTML 3.69 documentation claims to be able to read sequential # format but sequential files result in an EOF error. Use Interleaved. # Input file is not interleaved #comfile_h.write('i\n') # Jumble - When multiple datasets are analyzed, some programs automatically # jumbles, and prompts for a random number seed for jumbling. Othersise, # jumbling must be explicitly set. From a scripting perspecitve, it is safest therefore # to set jumbling first, so that we don't need to handle a bunch of special cases in # which the program asks for jumbling at a later time. if METHOD != "n": JUMBLE = "J" if JUMBLE == "J": phylip.jumble(comfile_h, msgfile_h, NUMJUM) # RESTML doesn't seem to prompt for a random number seed. We'll keep # this code in case that changes in the future. #RSEED = phylip.phylip_random() #msgfile_h.write( PROGRAM + ": SEED= " + str(RSEED)) if METHOD in ("b", "d"): comfile_h.write('m\n') # comfile_h.write('w\n') comfile_h.write(REPLICATES + '\n') # comfile_h.write(str(RSEED) + '\n') elif METHOD in ("ps", "po", "pw"): comfile_h.write('m\n') # comfile_h.write('d\n') comfile_h.write(REPLICATES + '\n') # comfile_h.write(str(RSEED) + '\n') # All sites are detected if ALLSITES == 'y': comfile_h.write('a\n') msgfile_h.write( 'Assuming all sites detected') #else: # msgfile_h.write( 'Assuming NOT all sites detected') # Speedier but rougher analysis if SPEEDY == 'n': comfile_h.write('s\n') msgfile_h.write( 'Doing thorough analysis') else: msgfile_h.write( 'Doing speedy but rougher analysis') # Global rearrangements if GLOBAL == 'y': comfile_h.write('g\n') msgfile_h.write( 'Doing global rearrangements') else: msgfile_h.write( 'No global rearrangements done') # Site length - Broken in restml 3.69. Attempting to set this # parameter has no effect. #if int(SITELEN) < 1 or int(SITELEN) > 50: # SITELEN = 6 #comfile_h.write('l\n') #comfile_h.write(str(SITELEN) + '\n') #msgfile_h.write( 'SITELENGTH = ' + str(SITELEN)) # Outgroup OUTGROUP = phylip.do_outgroup(OUTGROUP, NUMSEQ, comfile_h, msgfile_h) # Should sequence data be printed? if PRINTDATA == 'y': comfile_h.write('1\n') # When resampling, turn off printing trees to outfile if METHOD in ('b', 'd', 'ps', 'po', 'pw'): comfile_h.write('3\n') #accept current settings and do the analysis comfile_h.write('y\n') #----------------- Finish RESTML ----- comfile_h.close() # Debug: Allows us to examine the contents of RestdistComfile #p_testinfile = subprocess.call(["nedit", "RestdistComfile"]) comfile_h = open('RestMLComfile', 'r') termout_h = open(TERMOUT, 'w') #os.nice(8) start_time = time.time() p_PROGRAM = subprocess.Popen(['restml'], stdin=comfile_h) p_PROGRAM.wait() termout_h.close() msgfile_h.close() time_elapsed = time.time() - start_time #os.nice(0) outfile_h = open('outfile', 'a') outfile_h.write("Execution times on " + os.uname()[1] + ": " + str(time_elapsed)) outfile_h.close() # Debug statement. #p_testinfile = subprocess.call(["nedit", "outfile"]) #----------- Return results to calling directory---------------- # When using resampling, filter the treefile through # consense to generate an unrooted consensus tree. if METHOD in ('b', 'd', 'ps', 'po', 'pw'): outfile_h = open('outfile', 'a') outfile_h.write(" \n") outfile_h.write('-------------------------------------------\n') outfile_h.close() subprocess.call(['consense.py', 'outtree', 'e', '1', str(OUTGROUP), 'n', 'outfile.consense', TREEFILE]) shutil.move(TREEFILE, os.path.join(STARTDIR, TREEFILE)) phylip.merge_msg(os.path.join(STARTDIR, OUTFILE)) else: shutil.move('outtree', os.path.join(STARTDIR, TREEFILE)) phylip.merge_msg(os.path.join(STARTDIR, OUTFILE), False) os.chdir(STARTDIR) shutil.rmtree(TEMPDIR) print("RESTML completed.")