#!/usr/bin/env python

import birchscript
import re
import os
import os.path
import subprocess
import sys

# mrtrans.csh                  September  1,  2010
# This is a front end for mrtrans. It makes sure that the names of
# the sequences in PROTFILE and DNAFILE are the same, and re-orders
# the sequences in DNAFILE, if necessary, to be in the same order
# as in PROTFILE.
# This script assumes that sequence names in PROTFILE are IDENTICAL to
# the corresponding names in DNAFILE.

PROTFILE = sys.argv[1]
DNAFILE  = sys.argv[2]
JOBID    = str(os.getpid())

# ---------------- Reformat files for use by mrtrans binaries ----------------------------------
# Modified for compatability with both GDE and biolegato. bioLegato
# can't read pseudo-Genbank files created by readseq, so output now
# goes to a GDE flat file, which both can read.
# sed edits out ":CDS" or "_CDS" that is added by FEATURES
# tr gets rid of extra > and ASCII 128 added by mrtrans
# sed lops off the  comma and extra junk readseq adds to the name line
# including the " from >name..." that needs to be removed.
# mrtrans has been modified to allow sequence names longer than
# 9 char. Install mrtrans from mrtrans.tar.
# Finally, we have to translate the protein sequence to 
# uppercase, which is required by mrtrans.

#readseq -a -f=Pearson -pipe $DNAFILE |sed "s/[:_]CDS/_/" |sed "s/,.*//" | cut -f1 -d" " > $JOBID.dna; 
#grep '>' $JOBID.dna  | cut -c2-80 | cut -f1 -d" " > $JOBID.dna.nam
dnanam_lines = []
p_dna = subprocess.Popen(["readseq", "-a", "-f=Pearson",       "-pipe", DNAFILE  ], stdout=subprocess.PIPE)
for line in p_dna.stdout:
    line = re.sub(",.*", "", re.sub("[:_]CDS", "_", line)).split(" ")[0]
    if len(line) > 0 and line[0] == '>':
        dnanam_lines += line[1:]

#readseq -a -f=Pearson -C -pipe $PROTFILE |sed "s/[:_]CDS/_/" | sed "s/,.*//" >$JOBID.prot;
#grep '>' $JOBID.prot | cut -c2-80 | cut -f1 -d" " > $JOBID.pro.nam
pronam_lines = []
missing_lines = []
p_pro = subprocess.Popen(["readseq", "-a", "-f=Pearson", "-C", "-pipe", PROTFILE ], stdout=subprocess.PIPE)
for line in p_pro.stdout:
    line = re.sub(",.*", "", re.sub("[:_]CDS", "_", line))
    if len(line) > 0 and line[0] == '>':
        pronam_lines += line[1:]
# ----------------- Make sure names are consistent between DNA and Protein files -------------------
#Make a list of sequence names from each file.
#sort < $JOBID.pro.nam > $JOBID.pro.nam.sorted
#sort < $JOBID.dna.nam > $JOBID.dna.nam.sorted
#comm -23 $JOBID.pro.nam.sorted $JOBID.dna.nam.sorted > $JOBID.missing
        if line not in dnanam_lines:
            missing_lines += line
#p_pro.close()


print "Making sure all names in protein file have a counterpart in dna file..."

if len(missing_lines) == 0:
    # All protein sequences have a DNA counterpart.
    # Re-order the DNA file into the same order as the protein file.
    # A $JOBID.dna.num is the same as $JOBID.dna.nam, except that
    # the former begins with a line number for each name in the
    # latter. For each protein sequence, the appropriate line
    # number is chosen using grep, and readseq pulls out that
    # sequence from $DNAFILE.
    print "Re-ordering the DNA file to correspond to order of sequences"
    print "in protein file."

    dna_jobid_lines = []

    # cat -n $JOBID.dna.nam |tr -s " " " " > $JOBID.dna.num

    line_number = 1
    for line in dnanam_lines:
        dna_jobid_lines += re.sub(" +", " ", line)
        line_number += 1
    dnanam_lines = None

    h_dnareordered = open(JOBID + ".dna.reordered", "a")
    for name in pronam_lines:
        #set LINENUM = `grep -w "$name" $JOBID.dna.num | cut -f1 -d"	" |tr -d ' ' `
        LINENUM = 1
        for line in dna_jobid_lines:
            if re.search("\b" + name + "\b", line):
                break
            else:
                LINENUM += 1
        subprocess.call(["readseq", "-pipe", "-C", "-i" + LINENUM, "-fPearson", JOBID + ".dna"], stdout=h_dnareordered)
    h_dnareordered.close()

else:
    print 'The following sequences are present in the protein file'
    print 'but missing from the DNA file:'
    sys.stdout.writelines(missing_lines)

# ------------------------ Run mrtrans binary and cleanup output so bioLegato can read it
#mrtrans $JOBID.prot $JOBID.dna.reordered | sed "s/> from >/>/" |tr -d '' | readseq -pipe -a -f8 | sed "s/>>/#/"|sed 's/ from.*//' > $JOBID.flat;
p_mrtrans = subprocess.Popen(["mrtrans", JOBID + ".prot", JOBID + ".dna.reordered"], stdout=subprocess.PIPE)

p_readseq = subprocess.Popen(["readseq", "-pipe", "-a", "-f8"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
for line in p_mrtrans.stdout:
    #re.sub("> from >", ">", line).translate(None, '')
    re.sub("> from >", ">", line).translate(None, chr(128))
    p_readseq.stdin.write(line)
p_readseq.stdin.close()

h_output = open(JOBID + ".flat", "w")
for line in p_readseq.stdout:
    h_output.write(re.sub(" from.*", "", re.sub(">>", "#", line)))
h_output.close()

# Launch bioLegato in the background
ls_dir = [filename for filename in os.listdir(os.getcwd()) if os.path.isfile(filename) and os.path.splitext(filename)[0] == JOBID]
birchscript.Cleanrun([["blnalign", JOBID + ".flat"]], ls_dir, True)

# Delete temporary files
#$RM_CMD $JOBID.prot $JOBID.dna* $JOBID.*.nam $JOBID.*.sorted $JOBID.missing