#!/usr/bin/env python import os import os.path import phylip import shutil import socket import subprocess import sys import time #Version February 5, 2020 # Run dna distance programs as a command #Synopsis: dnadist.py infile dmethod transratio method replicates blocksize percent\ # tconmeth power subrep global negbranch outgroup jumble numjum\ # termout printdata outfile treefile #Convert arguments to variables INFILE = sys.argv[1] DMETHOD = sys.argv[2] TRANSRATIO = sys.argv[3] METHOD = sys.argv[4] REPLICATES = sys.argv[5] BLOCKSIZE = sys.argv[6] PERCENT = sys.argv[7] TCONMETH = sys.argv[8] POWER = sys.argv[9] SUBREP = sys.argv[10] GLOBAL = sys.argv[11] NEGBRANCH = sys.argv[12] OUTGROUP = sys.argv[13] JUMBLE = sys.argv[14] NUMJUM = sys.argv[15] TERMOUT = sys.argv[16] PRINTDATA = sys.argv[17] OUTFILE = sys.argv[18] TREEFILE = sys.argv[19] #Send program output to the terminal termout_h = open(TERMOUT, 'w') # Remember where we started STARTDIR = os.getcwd() # Make a temporary directory in which to run the program TEMPDIR = 'DNADIST.' + str(os.getpid()) os.mkdir(TEMPDIR) shutil.copy(INFILE, os.path.join(TEMPDIR, 'infile.temp')) os.chdir(TEMPDIR) # Debug statement - did infile.temp get written to TEMPDIR? #p_testinfile = subprocess.call(["nedit", "infile.temp"]) msgfile_h = open('MSGFILE', 'w') msgfile_h.write("DNA Distance Matrix Phylogeny Methods\n") msgfile_h.write("\n") msgfile_h.write("--------------------- DISTANCE MATRIX ---------------------\n") msgfile_h.write("\n") if METHOD == 'n': msgfile_h.write(" \n") shutil.copyfile('infile.temp', 'infile') if METHOD in ('b', 'd'): phylip.stdresample(METHOD, PERCENT, REPLICATES, '1', msgfile_h, 's', 'i') elif METHOD in ("ps", "po", "pw"): phylip.weightless_resample(METHOD, PERCENT, REPLICATES, '1', msgfile_h, 's', 'i') # Debug: Allows us to examine the contents of infile produced by either by seqboot, # or just copied, if METHOD = n. #p_testinfile = subprocess.call(["nedit", "infile"]) # This statement reads from infile, so we have to run it before # running the program, which also reads infile. NUMSEQ = phylip.get_numseq("infile") #---------------------------- DNADIST ----------------------------------- termout_h = open(TERMOUT, 'w') #----------------- generate keyboard input to send to DNADIST ----- # Generate keyboard input for protdist and save it to ProtdistComfile. # While it is possible to pipe the output directly to protdist, we have # discovered that on systems with a high load average, the program doesn't # always complete. Besides, it is WAY easier to debug when we save keyboard # output to a file that can be examined. comfile_h = open('DnadistComfile', 'w') if METHOD in ('b', 'd'): comfile_h.write("m\n") comfile_h.write("w\n") comfile_h.write(str(REPLICATES) + "\n") #comfile_h.write(str(RSEED) + "\n") elif METHOD in ("ps", "po", "pw"): comfile_h.write("m\n") comfile_h.write("d\n") comfile_h.write(str(REPLICATES) + "\n") # Choose method for constructing distance matrix if DMETHOD == "k": # Kimura 2-parameter msgfile_h.write("Distance matrix constructed using Kimura 2-parameter method\n") comfile_h.write("d\n") elif DMETHOD == "j": # Jukes - Cantor msgfile_h.write("Distance matrix constructed using Jukes-Cantor method\n") comfile_h.write("d\n") comfile_h.write("d\n") elif DMETHOD == "l": # LogDet msgfile_h.write("Distance matrix constructed using Log of the determinant of the joint\n") msgfile_h.write("nucleotide frequencies\n") comfile_h.write("d\n") comfile_h.write("d\n") comfile_h.write("d\n") else: # default (DMETHOD == "f") # F84 msgfile_h.write("Distance matrix constructed using F84 method\n") if DMETHOD in ("f", "k"): # Transition/transversion ratio if TRANSRATIO != 2.0: comfile_h.write("t\n") comfile_h.write(str(TRANSRATIO) + "\n") msgfile_h.write("Transition/Transversion ratio = " + str(TRANSRATIO) + "\n") #accept current settings and do the analysis comfile_h.write("y" + "\n") comfile_h.close() # Debug: Allows us to examine the contents of DnadistComfile #p_testinfile = subprocess.call(["nedit", "DnadistComfile"]) os.nice(4) #----------------- Run DNADIST ----- comfile_h = open('DnadistComfile', 'r') p_MATRIX = subprocess.Popen(['dnadist'], stdin=comfile_h) p_MATRIX.wait() comfile_h.close() shutil.move('outfile', 'infile') # Allows us to examine the contents of matrix file produced by dnadist. #p_testinfile = subprocess.call(["nedit", "infile"]) msgfile_h.write("--------------------- CONSTRUCTING TREE(S) ---------------------\n") msgfile_h.write("\n") #----------------- generate keyboard input to send to distance tree program ----- # Choose method for constructing distance matrix # w = weighbor, weighted Neighbor-Joining # F = FITCH, Fitch-Margoliash method # f = FITCH, Minimum evolution # K = KITSCH, Fitch-Margoliash method # k = KITSCH, Minimum evolution method # N = Neighbor-joining # U = Neighbor-joining # default = FITCH, Fitch-Margoliash method if TCONMETH in ("w"): PROGRAM = "weighbor" msgfile_h.write('Please cite:\n') msgfile_h.write('WEIGHBOR - Weighted Neighbor Joining.\n') msgfile_h.write('William J. Bruno, Nicholas D. Socci, and Aaron L. Halpern\n') msgfile_h.write('Weighted Neighbor Joining: A Likelihood-Based Approach to\n') msgfile_h.write('Distance-Based Phylogeny Reconstruction,\n') msgfile_h.write('Mol. Biol. Evol. 17 (1): 189-197 (2000).\n') msgfile_h.write(" \n") PROGRAM = "weighbor" #Read length of sequence alignment from INFILE h_infile = open("infile.temp", "r") SEQLENGTH = h_infile.readline().split()[1] h_infile.close() print("SEQLENGTH = " + str(SEQLENGTH)) start_time = time.time() subprocess.call(["weighbor", "-L", str(SEQLENGTH), "-b", "4", "-i", "infile", "-o", "outtree", "-v"], stdout=open(TERMOUT, "a")) shutil.move("weighbor.out", "outfile") time_elapsed = time.time() - start_time h_smalloutfile = open("outfile", "a") h_smalloutfile.write("Execution times on " + socket.gethostname() + ": " + str(time_elapsed)) h_smalloutfile.close() msgfile_h.close() else: if TCONMETH in ("K", "k"): PROGRAM = "kitsch" elif TCONMETH in ("N", "U"): PROGRAM = "neighbor" else: PROGRAM = "fitch" #----------------- Run FITCH, KITCH or NEIGHBOR ----- termout_h = open(TERMOUT, 'a') # Generate keyboard input for dnadist and save it to DistanceComfile. comfile_h = open('DistanceComfile', 'w') # Jumble - When multiple datasets are analyzed, some programs automatically # jumbles, and prompts for a random number seed for jumbling. Othersise, # jumbling must be explicitly set. From a scripting perspecitve, it is safest therefore # to set jumbling first, so that we don't need to handle a bunch of special cases in # which the program asks for jumbling at a later time. if METHOD != "n": JUMBLE = "J" if JUMBLE == "J": phylip.jumble(comfile_h, msgfile_h, NUMJUM) if TCONMETH in ("k", "f"): comfile_h.write("d\n") elif TCONMETH == "U": comfile_h.write("n\n") RSEED = phylip.phylip_random() if METHOD in ("b", "d"): comfile_h.write("m\n") comfile_h.write(str(REPLICATES) + '\n') comfile_h.write(str(RSEED) + '\n') elif METHOD in ("ps", "po", "pw"): comfile_h.write("m\n") comfile_h.write(str(REPLICATES) + '\n') comfile_h.write(str(RSEED) + '\n') #case "n": DO NOTHING # Subreplicates # Phylip documentation doesn't tell us what this parameter does, so # fo rnoe we leave it commented out. #if SUBREP == 'n': # comfile_h.write('s\n') if PROGRAM in ('fitch', 'kitsch'): # Global rearrangements if GLOBAL == 'y': comfile_h.write('g\n') # Negative branch lengths if NEGBRANCH == 'y': comfile_h.write('-\n') # Outgroup OUTGROUP = phylip.do_outgroup(OUTGROUP, NUMSEQ, comfile_h, msgfile_h) # Should sequence data be printed? if PRINTDATA == "y": comfile_h.write("1\n") # When resampling, turn off printing trees to outfile if METHOD in ('b', 'd', 'ps', 'po', 'pw'): comfile_h.write("3\n") #accept current settings and do the analysis comfile_h.write("y" + '\n') comfile_h.close() # Debug: Allows us to examine the contents of DistanceComfile #p_testinfile = subprocess.call(["nedit", "DistanceComfile"]) msgfile_h.write(PROGRAM + ": SEED= " + str(RSEED)) #----------------- Run FITCH, KITCH or NEIGHBOR ----- comfile_h = open('DistanceComfile', 'r') start_time = time.time() p_TREE = subprocess.Popen([PROGRAM], stdin=comfile_h) p_TREE.wait() comfile_h.close() termout_h.close() msgfile_h.close() time_elapsed = time.time() - start_time outfile_h = open('outfile', 'a') # There is no standard way to time a subprocess in Python. outfile_h.write("Elapsed time on " + os.uname()[1] + ": " + str(time_elapsed) + " seconds") outfile_h.close() # Debug: Allows us to examine the contents of DnadistComfile #p_testinfile = subprocess.call(["nedit", "outfile"]) os.nice(0) #----------- Return results to calling directory---------------- # When using resampling, filter the treefile through # consense to generate an unrooted consensus tree. if METHOD in ('b', 'd', 'ps', 'po', 'pw'): outfile_h = open('outfile', 'a') outfile_h.write(" \n") outfile_h.write('-------------------------------------------\n') outfile_h.close() subprocess.call(['consense.py', 'outtree', 'e', '1', str(OUTGROUP), 'n', 'outfile.consense', TREEFILE]) shutil.move(TREEFILE, os.path.join(STARTDIR, TREEFILE)) phylip.merge_msg(os.path.join(STARTDIR, OUTFILE)) else: shutil.move('outtree', os.path.join(STARTDIR, TREEFILE)) phylip.merge_msg(os.path.join(STARTDIR, OUTFILE), False) os.chdir(STARTDIR) shutil.rmtree(TEMPDIR) print(PROGRAM + " completed.")