#!/usr/bin/env python3

import os
import subprocess
import sys
from random import randint

# xylem_shuffle.py                  
# This is a front end for shuffle. 

def to_fasta_flags(lines, flag):
    """Return a copy of *lines* with the leading *flag* of name lines set to '>'.

    GDE flat files mark name lines with '#', '%' or '"', but shuffle only
    recognizes '>'.  Only the first character of a line is changed, so a
    flag character occurring inside a name or a sequence is left alone.

    lines -- list of lines (with line endings) read from the input file
    flag  -- the single character that marks a name line in the input
    """
    return ['>' + line[1:] if line.startswith(flag) else line
            for line in lines]


def to_gde_lines(lines, flag):
    """Convert shuffle output back into GDE flat file lines.

    Each name line (a line starting with '>') gets *flag* as its first
    character again, is truncated after the first blank, and has '-rand'
    appended to indicate that the sequence has been randomized.  All other
    lines are returned unchanged.

    lines -- list of lines produced by shuffle (message lines removed)
    flag  -- the name-line character found in the original input file
    """
    converted = []
    for line in lines:
        if line.startswith('>'):
            # Swap only the leading '>' so that a '>' inside the name survives.
            tokens = (flag + line[1:]).split()
            name = tokens[0] if tokens else flag
            line = name + '-rand\n'
        converted.append(line)
    return converted


def run_shuffle(lines, window, overlap):
    """Run shuffle on *lines* and return its output lines.

    The input is passed to shuffle through a temporary file in the current
    directory, and the output is collected in a second temporary file.  Both
    files carry the process id in their name and are removed before
    returning, even if shuffle cannot be started.

    lines   -- list of lines to feed to shuffle on standard input
    window  -- value for shuffle's -w option, as a string
    overlap -- value for shuffle's -o option, as a string

    The first 2 lines of shuffle's output are message lines that might
    confuse programs that read fasta files, so they are not returned.
    """
    pid = str(os.getpid())
    tifn = 'xylem_shuffle.py.in' + '.' + pid
    tofn = 'xylem_shuffle.py.out' + '.' + pid
    seed = randint(1, 32767)
    try:
        # The with-statement guarantees the input is flushed and closed
        # before shuffle starts reading it.
        with open(tifn, 'w') as tempin:
            tempin.writelines(lines)
        with open(tifn, 'r') as tempin, open(tofn, 'w') as tempout:
            # NOTE: the exit status of shuffle is deliberately ignored, as
            # this script has always done.
            subprocess.call(
                ['shuffle', '-s' + str(seed), '-w' + window, '-o' + overlap],
                stdin=tempin, stdout=tempout)
        with open(tofn, 'r') as tempout:
            return tempout.readlines()[2:]
    finally:
        for name in (tifn, tofn):
            if os.path.exists(name):
                os.remove(name)


def main(argv=None):
    """Shuffle the sequences of a GDE flat file.

    argv -- list of four arguments: infile, window, overlap, outfile.
            Defaults to the command line arguments (sys.argv[1:]).

    Exits with a message if arguments are missing or the input is empty.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        sys.exit('Usage: xylem_shuffle.py infile window overlap outfile')
    infile, window, overlap, outfile = (str(arg) for arg in args[:4])

    with open(infile, 'r') as infile_h:
        lines_in = infile_h.readlines()
    if not lines_in:
        sys.exit('xylem_shuffle.py: input file ' + infile + ' is empty')

    # sflag is the first character of the first line in the file.
    # We have to remember the character used to denote the name
    # line, because shuffle will indiscriminately change it to '>'.
    # GDE flat files can start with either '#', '%' or '"'
    sflag = lines_in[0][0]

    lines_out = run_shuffle(to_fasta_flags(lines_in, sflag), window, overlap)

    # Convert the output into GDE flat file format, using the flag
    # character that was found earlier to indicate either DNA or protein.
    with open(outfile, 'w') as outfile_h:
        outfile_h.writelines(to_gde_lines(lines_out, sflag))


if __name__ == '__main__':
    main()