#!/usr/bin/env python3
"""xylem_shuffle.py

This is a front end for shuffle.

Usage:
    xylem_shuffle.py INFILE WINDOW OVERLAP OUTFILE

INFILE is read as a GDE flat file.  Its name lines are rewritten to use the
'>' flag (the only flag character that shuffle knows about), the result is
piped through ``shuffle -s<seed> -w<WINDOW> -o<OVERLAP>``, and the shuffled
sequences are written to OUTFILE with the original flag character restored
and "-rand" appended to each (truncated) sequence name.
"""

import os
import subprocess
import sys
from random import randint

# The only name-line flag character that shuffle understands.
FASTA_FLAG = '>'


def mark_names_for_shuffle(lines, flag):
    """Return a copy of *lines* with the leading *flag* of name lines set to '>'.

    Every line is examined, including the last one, and only the first
    character of a name line is changed; later occurrences of *flag* inside
    the name are left alone.

    Args:
        lines: lines of the input file, as returned by ``readlines()``.
        flag: the single character that marks a name line in the input.

    Returns:
        A new list of lines suitable as input for shuffle.
    """
    return [
        FASTA_FLAG + line[1:] if line[:1] == flag else line
        for line in lines
    ]


def restore_names(lines, flag):
    """Convert shuffle's name lines back to the original flag character.

    Each name line (one starting with '>') is truncated after the first
    blank and gets "-rand" appended, to indicate that the sequence has been
    randomized.  All other lines are passed through unchanged.

    Args:
        lines: output lines produced by shuffle.
        flag: the flag character found in the original input file.

    Returns:
        A new list of lines in GDE flat file format.
    """
    restored = []
    for line in lines:
        if line[:1] == FASTA_FLAG:
            # Swap only the leading '>' so a '>' inside the name survives.
            fields = (flag + line[1:]).split()
            name = fields[0] if fields else flag
            line = name + '-rand\n'
        restored.append(line)
    return restored


def run_shuffle(fasta_lines, window, overlap):
    """Run shuffle on *fasta_lines* and return its sequence output.

    The lines are written to a temporary input file in the current
    directory, shuffle is run with a random seed, and its output is read
    back from a temporary output file.  Both temporary files are removed
    even if an error occurs.

    Args:
        fasta_lines: input lines whose name lines start with '>'.
        window: value passed to shuffle as ``-w<window>``.
        overlap: value passed to shuffle as ``-o<overlap>``.

    Returns:
        The lines written by shuffle, minus the first two, which are
        message lines that might confuse programs that read fasta files.
    """
    pid = str(os.getpid())
    temp_in_name = 'xylem_shuffle.py.in' + '.' + pid
    temp_out_name = 'xylem_shuffle.py.out' + '.' + pid
    seed = randint(1, 32767)

    try:
        # The 'with' block guarantees the data is flushed and the file is
        # closed before shuffle reads it.
        with open(temp_in_name, 'w') as temp_in:
            temp_in.writelines(fasta_lines)

        with open(temp_in_name, 'r') as temp_in, \
                open(temp_out_name, 'w') as temp_out:
            status = subprocess.call(
                ['shuffle', '-s' + str(seed), '-w' + window, '-o' + overlap],
                stdin=temp_in,
                stdout=temp_out,
            )
        if status != 0:
            # Warn only: whatever shuffle wrote is still processed.
            print('xylem_shuffle.py: warning: shuffle exited with status '
                  + str(status), file=sys.stderr)

        with open(temp_out_name, 'r') as temp_out:
            shuffled = temp_out.readlines()
    finally:
        for temp_name in (temp_in_name, temp_out_name):
            try:
                os.remove(temp_name)
            except FileNotFoundError:
                # The file was never created because an earlier step failed.
                pass

    return shuffled[2:]


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.  The first four items are INFILE, WINDOW,
            OVERLAP and OUTFILE; any further items are ignored.

    Returns:
        The process exit status: 0 on success, 1 on a usage error or an
        empty input file.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        print('Usage: xylem_shuffle.py INFILE WINDOW OVERLAP OUTFILE',
              file=sys.stderr)
        return 1
    infile, window, overlap, outfile = args[:4]

    with open(infile, 'r') as handle:
        lines_in = handle.readlines()
    if not lines_in:
        print('xylem_shuffle.py: input file is empty: ' + infile,
              file=sys.stderr)
        return 1

    # The flag is the first character of the first line in the file.
    # We have to remember the character used to denote the name line,
    # because shuffle will indiscriminately change it to '>'.
    # GDE flat files can start with either '#', '%' or '"'.
    flag = lines_in[0][0]

    shuffled = run_shuffle(
        mark_names_for_shuffle(lines_in, flag), window, overlap)

    with open(outfile, 'w') as handle:
        handle.writelines(restore_names(shuffled, flag))
    return 0


if __name__ == '__main__':
    sys.exit(main())