#!/usr/bin/env python

import subprocess
import sys
from random import randint

# xylem_shuffle.py                  
# This is a front end for shuffle. 

# Command-line arguments: input file, shuffle window size, shuffle overlap
# and output file. WINDOW and OVERLAP are kept as strings because they are
# concatenated directly onto shuffle's -w and -o options.
if len(sys.argv) < 5:
    # sys.exit with a string prints it to stderr and exits with status 1,
    # which replaces the bare IndexError traceback for missing arguments.
    sys.exit('Usage: ' + sys.argv[0] + ' infile window overlap outfile')

INFILE  = sys.argv[1]
WINDOW  = sys.argv[2]
OVERLAP = sys.argv[3]
OUTFILE = sys.argv[4]

# Read the whole input up front; the with-block closes the file even if
# reading raises.
with open(INFILE, 'r') as INFILE_H:
    lines_in = INFILE_H.readlines()

# SFLAG is the first character of the first line in the file.
# We have to remember the character used to denote the name
# line, because shuffle will indiscriminately change it to '>'.
# GDE flat files can start with either '#', '%' or '"'.
if not lines_in:
    # An empty file has no first line to take the flag from.
    sys.exit('xylem_shuffle: input file ' + INFILE + ' is empty')
SFLAG = lines_in[0][0]

# Next, we have to convert the SFLAG character into '>', which is the
# only flag character that shuffle knows about. Only the leading character
# is swapped, so an SFLAG or '>' occurring later in a name is left alone,
# and every line -- including the last one -- is examined.
numlines = len(lines_in)
for i, line in enumerate(lines_in):
    if line.startswith(SFLAG):
        lines_in[i] = '>' + line[1:]

# Run shuffle, and delete the first 2 lines, which are message lines
# that might confuse programs that read fasta files.
# universal_newlines=True opens the pipes in text mode so that str lines
# can be sent and received on Python 3 as well as Python 2.
# communicate() feeds stdin and drains stdout together, which avoids the
# deadlock that can occur when all input is written before any output is
# read; it also waits for the process to finish.
SEED = randint(1, 32767)
p = subprocess.Popen(
    ['shuffle', '-s' + str(SEED), '-w' + WINDOW, '-o' + OVERLAP],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    universal_newlines=True
)
shuffled, _ = p.communicate(''.join(lines_in))
lines_out = shuffled.splitlines(True)[2:]


# Truncate the name lines after the first blank and
# add "-rand" to each name to indicate that the sequences
# have been randomized.
# Convert the output into GDE flat file format, using the
# flag character that was found earlier to indicate either DNA or protein.
with open(OUTFILE, 'w') as OUTFILE_H:
    for line in lines_out:
        if line.startswith('>'):
            # Restore only the leading flag character, mirroring the
            # conversion done before shuffle was run.
            restored = SFLAG + line[1:]
            line = restored.split()[0] + '-rand\n'
        OUTFILE_H.write(line)