#!/usr/bin/env python
"""xylem_shuffle.py - a front end for the ``shuffle`` program.

Usage: xylem_shuffle.py INFILE WINDOW OVERLAP OUTFILE

Reads the GDE flat file INFILE, pipes its sequences through ``shuffle``
(with a random seed and the given WINDOW and OVERLAP values) and writes
the randomized sequences to OUTFILE, again as a GDE flat file, with
"-rand" appended to every sequence name.
"""

import subprocess
import sys
from random import randint

# The only name-line flag character that shuffle knows about.
FASTA_FLAG = '>'


def to_fasta_names(lines, flag):
    """Return a copy of *lines* in which name lines start with '>'.

    A name line is any line whose first character is *flag*.  Only that
    leading character is swapped; later occurrences of *flag* inside the
    name are left alone.  Every line is examined, including the last one.
    """
    return [FASTA_FLAG + line[1:] if line.startswith(flag) else line
            for line in lines]


def to_gde_names(lines, flag):
    """Return a copy of *lines* with name lines converted back to GDE form.

    For each line starting with '>', the leading '>' is replaced by *flag*,
    the line is truncated after the first blank, and "-rand" is appended to
    indicate that the sequence has been randomized.  Other lines are passed
    through unchanged.  *flag* must be a single non-whitespace character.
    """
    converted = []
    for line in lines:
        if line.startswith(FASTA_FLAG):
            # Swap only the leading flag, then keep the first token.
            line = (flag + line[1:]).split()[0] + '-rand\n'
        converted.append(line)
    return converted


def main(argv=None):
    """Run shuffle on the input file named on the command line.

    *argv* defaults to ``sys.argv``; it must hold the program name followed
    by INFILE, WINDOW, OVERLAP and OUTFILE.  Exits with a message if the
    arguments are missing or the input does not start with a name line.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 5:
        sys.exit('Usage: xylem_shuffle.py INFILE WINDOW OVERLAP OUTFILE')
    infile, window, overlap, outfile = [str(arg) for arg in argv[1:5]]

    with open(infile, 'r') as handle:
        lines_in = handle.readlines()

    # The flag is the first character of the first line in the file.
    # We have to remember the character used to denote the name line,
    # because shuffle will indiscriminately change it to '>'.
    # GDE flat files can start with either '#', '%' or '"'.
    flag = lines_in[0][:1] if lines_in else ''
    if not flag.strip():
        sys.exit('xylem_shuffle.py: ' + infile +
                 ' is empty or does not start with a name line')

    seed = randint(1, 32767)
    # universal_newlines=True puts the pipes in text mode, so the same
    # str data can be written and read under both Python 2 and Python 3.
    proc = subprocess.Popen(
        ['shuffle', '-s' + str(seed), '-w' + window, '-o' + overlap],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        universal_newlines=True)
    # communicate() feeds stdin and drains stdout concurrently, which
    # avoids a deadlock when the data exceeds the pipe buffer size.
    shuffled, _ = proc.communicate(''.join(to_fasta_names(lines_in, flag)))

    # Split on '\n' only, so each piece corresponds to one output line.
    pieces = shuffled.split('\n')
    if pieces[-1] == '':
        pieces.pop()  # text ended with a newline: no trailing empty line
    # Delete the first 2 lines, which are message lines that might
    # confuse programs that read fasta files.
    lines_out = [piece + '\n' for piece in pieces][2:]

    with open(outfile, 'w') as handle:
        handle.writelines(to_gde_names(lines_out, flag))
    return 0


if __name__ == '__main__':
    sys.exit(main())