#!/usr/bin/env python
"""Drive repeated fastDNAml bootstrap analyses.

For every bootstrap seed the input is piped through ``bootstrap``, the
standard filter commands (``jumble``, ``quickadd``, ``treefile``), any extra
user-supplied filter commands and finally ``out.PID fastDNAml`` (run under
``nice``).  That pipeline is repeated until the best likelihood has been
reproduced ``n_best`` times or ``maxjumble`` attempts have been made, after
which ``clean_jumbles`` is run unless ``-noclean`` was given.

The ``print`` calls below always pass a single string inside parentheses, a
form that behaves the same under Python 2 and Python 3.
"""

import collections
import os
import os.path
import re
import subprocess
import sys
import time

# Filter commands always inserted between ``bootstrap`` and fastDNAml.
# Treated as read-only; build_filters() hands out copies.
stdopt = [["jumble"], ["quickadd"], ["treefile"]]

# Flags that consume the following command-line argument, mapped to the
# Options field they fill in.
VALUE_FLAGS = {
    "-boots": "boots", "-b": "boots",
    "-seed": "seed", "-s": "seed",
    "-max": "max_jumbles", "-m": "max_jumbles",
    "-nice": "nice", "-n": "nice",
}

# Lines of this form in the per-jumble output files carry the tree score.
LIKELIHOOD_LINE = re.compile(r"^Ln Likelihood\s*=\s*(\S+)")

USAGE = """
Usage: fastDNAml_boot.py [-boots nboot] [-seed seed] \\
           [-max maxjumble] [-nice nicevalue] [-noclean] \\
           in_file  n_best  [ "dnaml_opt1 [ | dnaml_opt2 [...]]" ]

For the current bootstrap seed, the sequence input order is jumbled (up to
maxjumble times) until the same best tree is found n_best times.  The output
files are then reduced to a summary of the scores produced by jumbling, and
one example of the best tree.  The process is then repeated with new bootstrap
seeds until nboot samples have been analyzed.

Boot, jumble, treefile and quickadd are included by the script and should not
be specified by the user or in the data file.  Additional fastDNAml program
options are enclosed in quotes, and separated by vertical bars (|).

Flags and parameters:

    in_file  --  name of the input data file
    n_best   --  input order is jumbled (up to maxjumble times) until
                 same tree is found n_best times
    -boots nboot    --  number of different bootstrap samples (Default=1)
    -seed seed      --  seed for first bootstrap (Default is based on the
                        process ID and time of day)
    -max maxjumble  --  maximum attempts at replicating inferred tree
                        (Default=10)
    -nice nicevalue --  run fastDNAml with specified nice value (Default=10)
    -noclean        --  inhibits cleanup of the output files for the
                        individual seeds
"""

# Parsed command line: four integers, a boolean and the leftover arguments.
Options = collections.namedtuple(
    "Options", "boots seed max_jumbles nice clean_up positional")


class UsageError(Exception):
    """Raised when the command line cannot be interpreted."""


def default_seed():
    """Return an integer seed made of the process ID followed by MMSS."""
    return int("%d%s" % (os.getpid(), time.strftime("%M%S")))


def format_seed(seed):
    """Return *seed* as a decimal string zero-padded to at least 9 digits."""
    return "%09d" % int(seed)


def parse_args(argv):
    """Split *argv* (program name included) into an Options tuple.

    Leading flags are consumed until the first argument that does not start
    with ``-`` or until a lone ``-``, which is skipped.  As in the original
    loop, the final argument is never examined as a flag, so a value-taking
    flag always has a following argument to read.

    Raises:
        UsageError: on an unknown flag or a non-integer flag value.
    """
    settings = {"boots": 1, "seed": None, "max_jumbles": 10, "nice": 10}
    clean_up = True
    index = 1
    last = len(argv) - 1
    while index < last:
        flag = argv[index]
        if not flag.startswith("-"):
            break
        if flag == "-":
            index += 1
            break
        if flag in ("-noclean", "-c"):
            clean_up = False
            index += 1
            continue
        if flag not in VALUE_FLAGS:
            raise UsageError("Bad flag: " + " ".join(argv[index:]))
        try:
            settings[VALUE_FLAGS[flag]] = int(argv[index + 1])
        except ValueError:
            raise UsageError(
                "Bad value for %s: %s" % (flag, argv[index + 1]))
        index += 2
    if settings["seed"] is None:
        settings["seed"] = default_seed()
    return Options(settings["boots"], settings["seed"],
                   settings["max_jumbles"], settings["nice"],
                   clean_up, list(argv[index:]))


def build_filters(extra=None):
    """Return the filter commands as a list of argv lists.

    The standard filters come first.  *extra*, when given, is a string of
    commands separated by vertical bars; each non-empty command is split on
    whitespace and appended after the standard ones.
    """
    filters = [list(command) for command in stdopt]
    if extra:
        filters.extend(
            part.split() for part in extra.split("|") if part.strip())
    return filters


def resolve_input(name):
    """Locate the input file for *name*.

    Returns a ``(root, infile)`` pair, or None when neither *name* nor
    *name* plus ``.phy`` / ``.phylip`` exists.  When *name* itself exists,
    the root is *name* with a trailing ``.phy`` or ``.phylip`` removed.
    """
    if os.path.exists(name):
        return re.sub(r"\.phy(lip)?$", "", name), name
    for suffix in (".phy", ".phylip"):
        if os.path.exists(name + suffix):
            return name, name + suffix
    return None


def jumble_outputs(out, directory=None):
    """Return the regular files named ``<out>.<digit>...`` in *directory*.

    *directory* defaults to the current working directory.  The match is
    anchored at the start of the file name and *out* is taken literally,
    mirroring the shell glob ``$out.[0-9]*``.
    """
    if directory is None:
        directory = os.getcwd()
    pattern = re.compile(re.escape(out) + r"\.[0-9]")
    candidates = (os.path.join(directory, name)
                  for name in os.listdir(directory) if pattern.match(name))
    return sorted(path for path in candidates if os.path.isfile(path))


def read_likelihoods(paths):
    """Collect every ``Ln Likelihood = <number>`` value found in *paths*.

    Values are accumulated across all files; a line whose value does not
    parse as a float is ignored.
    """
    values = []
    for path in paths:
        with open(path, "r") as handle:
            for line in handle:
                found = LIKELIHOOD_LINE.match(line)
                if not found:
                    continue
                try:
                    values.append(float(found.group(1)))
                except ValueError:
                    continue
    return values


def count_best(likelihoods, tolerance=0.001):
    """Return ``(n_best, n_jumble)`` for a list of likelihood values.

    ``n_jumble`` is the number of values; ``n_best`` is how many of them lie
    within *tolerance* of the largest one (the largest itself included).
    """
    if not likelihoods:
        return 0, 0
    threshold = max(likelihoods) - tolerance
    matches = sum(1 for value in likelihoods if value >= threshold)
    return matches, len(likelihoods)


def run_jumble(infile, seed, filters, nice_value, out):
    """Run one ``bootstrap | filters... | nice out.PID fastDNAml`` pipeline.

    The final command's standard output is discarded, as in the shell form
    this was ported from.  Returns the exit status of the final command.
    """
    stages = []
    try:
        with open(infile, "r") as source, open(os.devnull, "w") as devnull:
            head = subprocess.Popen(["bootstrap", seed], stdin=source,
                                    stdout=subprocess.PIPE)
            stages.append(head)
            upstream = head.stdout
            for command in filters:
                stage = subprocess.Popen(command, stdin=upstream,
                                         stdout=subprocess.PIPE)
                stages.append(stage)
                # Drop our copy of the pipe so that the writer is signalled
                # if its reader goes away early.
                upstream.close()
                upstream = stage.stdout
            # Running under nice(1) raises the niceness of the child only;
            # os.nice() in this process could not be undone afterwards.
            return subprocess.call(
                ["nice", "-n", str(nice_value), "out.PID", "fastDNAml", out],
                stdin=upstream, stdout=devnull)
    finally:
        for stage in stages:
            if stage.stdout is not None:
                stage.stdout.close()
            stage.wait()


def run_bootstrap(infile, root, seed, n_best, max_jumbles, nice_value,
                  filters, clean_up):
    """Analyse one bootstrap seed; return 0 or the failing pipeline status.

    Nothing is done when ``<out>.tree`` or ``<out>.out`` already exists
    (the seed was used before).  Otherwise the pipeline is repeated until the
    best likelihood has been seen *n_best* times, *max_jumbles* results
    exist, or (when cleaning up) a ``<out>.summary`` file is present.
    """
    padded = format_seed(seed)
    out = os.path.basename(root + "_" + padded)

    # Check for reuse of same random seed:
    if os.path.exists(out + ".tree") or os.path.exists(out + ".out"):
        return 0

    # Loop over jumble orders:
    while True:
        if clean_up and os.path.exists(out + ".summary"):
            print("")
            print("fastDNAml_boot.py: Jumbling stopped by existence of "
                  "summary file:")
            print(out + ".summary")
            print("")
            break
        matches, attempts = count_best(read_likelihoods(jumble_outputs(out)))
        if matches >= n_best or attempts >= max_jumbles:
            break
        status = run_jumble(infile, padded, filters, nice_value, out)
        if status != 0:
            # Without this a failing pipeline would be retried forever.
            return status

    if clean_up:
        subprocess.call(["clean_jumbles", out])
    return 0


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Returns 1 after printing the usage text or an input-file error, the
    status of a failed pipeline, or 0 on success.  Each bootstrap after the
    first uses the previous seed plus one so that every sample differs.
    """
    if argv is None:
        argv = sys.argv
    try:
        options = parse_args(argv)
        if len(options.positional) == 2:
            filters = build_filters()
        elif len(options.positional) == 3:
            filters = build_filters(options.positional[2])
        else:
            raise UsageError("")
        try:
            n_best = int(options.positional[1])
        except ValueError:
            raise UsageError("Bad n_best: " + options.positional[1])
    except UsageError as error:
        if str(error):
            print(str(error))
        print(USAGE)
        return 1

    located = resolve_input(options.positional[0])
    if located is None:
        print("fastDNAml_boot.py: Unable to find input file: "
              + options.positional[0])
        return 1
    root, infile = located

    seed = options.seed
    for _ in range(options.boots):
        status = run_bootstrap(infile, root, seed, n_best,
                               options.max_jumbles, options.nice,
                               filters, options.clean_up)
        if status != 0:
            return status
        seed += 1
    return 0


if __name__ == "__main__":
    sys.exit(main())