#!/usr/bin/env python

import birchenv
import birchscript
import re
import os
import os.path
import shutil
import subprocess
import sys

#Version 5/23/2008
# Run fasta programs
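#Synopsis: fastaout.csh -m # outfile outname
#   #        output mode (second argument); 3 loads the alignment into GDE
#   outfile  file of FASTA program output to process
#   outname  base name for the saved .names/.fasta files; if it is "" the
#            output is opened in a text editor instead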

# Process id, used only as a suffix for scratch file names (outfile.<pid>,
# name.<pid>, seqfile.<pid>, ...). It is stored as a string because the rest
# of the script concatenates it directly onto file names.
PID = str(os.getpid())

# set global variables
# sys.argv[2] is the numeric output mode; mode 3 sends the result to GDE.
if int(sys.argv[2]) == 3:
    DESTINATION = "GDE"
    GDEOUT = PID + ".gen"
else:
    # An omitted fourth argument is treated the same as an empty one:
    # there is no output name, so the results go to the text editor.
    if len(sys.argv) <= 4 or sys.argv[4] == "":
        DESTINATION = "TEXTEDIT"
    else:
        DESTINATION = "FILES"
        OUTNAME = sys.argv[4]

# File containing the FASTA program output to be processed.
OUTFILE = sys.argv[3]

# Parse out the accession numbers or names of hits
# and write to name.$PID. 

# Normally, Pearson/FASTA format
# files contain a simple name in the form ">name". FASTA
# programs put the name at the beginning of lines
# of the form "name - rest of the line"
# GenPept files are a special case. They have complex
# name lines, with various ID's. The GenBank accession
# number has to be parsed out of these lines. The lines look like
# >gi|5881102|gb|AAD55053.1|AF170915_1 (AF170915) green fluorescent protein [Expression vector FRMwg]
# FASTY makes matters worse by truncating the line to make room for
# statistics, which sometimes cuts off the accession # that's in
# parentheses. So we have to be able to deal with two types of
# lines, one with the ACCESSION number in parentheses, the other
# in the form "|AF170915_1"


def _write_hit_names(report_path, names_path):
    """Write the name of each hit in a FASTA report to names_path.

    Replaces the shell pipeline
        egrep -e '^* - *' report | egrep -v -e '^>>' | egrep -v -e '^#' |\
            cut -d" " -f1 > names
    A hit line is one that contains " -" and does not start with ">>" or
    "#"; its name is the text before the first space. One name is written
    per line, as cut would have done.
    """
    report = open(report_path, "r")
    try:
        names = open(names_path, "w")
        try:
            for line in report:
                if " -" not in line:
                    continue
                if line.startswith(">>") or line.startswith("#"):
                    continue
                # The line is known to contain a space, so the first field
                # never carries the line terminator; add one explicitly.
                names.write(line.split(" ")[0] + "\n")
        finally:
            names.close()
    finally:
        report.close()


def _append_sequence(wrapped, name, index, seq_path):
    """Append a '>name' header and sequence number `index` to `wrapped`.

    wrapped is an open, writable file; the sequence text is produced by
    readseq from seq_path and written directly to that file.
    """
    wrapped.write(">" + name + "\n")
    # readseq writes straight to the underlying file descriptor, so the
    # buffered header must be flushed first or the two could land out of order.
    wrapped.flush()
    subprocess.call(["readseq", "-i" + str(index), "-fPlain", "-pipe", seq_path],
                    stdout=wrapped)


# Suffix for scratch files. str() keeps the concatenations below valid
# whether PID is held as an int or as a string.
PID_TAG = str(PID)

if DESTINATION == "TEXTEDIT":
    # Special code for text editors used by GDE and scripts called by GDE
    # Nedit crashes in some Linux systems due to libraries set in BIRCHLIBS.
    # nedit_wrapper unsets LD_LIBRARY_PATH before calling nedit.
    # gedit opens all files in a single window. gedit_wrapper.sh forces
    # gedit to open each file in different window.
    # choose_edit_wrapper.sh returns the name of the wrapper to use
    # for each editor, or just returns BL_TextEditor if there is no
    # wrapper.
    report_copy = "outfile." + PID_TAG
    names_file = "name." + PID_TAG
    shutil.move(OUTFILE, report_copy)

    # extract names of hits into a file and open the file in text editor
    _write_hit_names(report_copy, names_file)

    # The names file always exists at this point, so test for content
    # instead: an empty hit list is simply removed rather than displayed.
    if os.path.getsize(names_file) > 0:
        birchscript.Cleanrun([[birchenv.BL_TextEditor, names_file, "-geometry", "15x40"]], [names_file], True)
    else:
        os.remove(names_file)
    # read outfile into texteditor
    birchscript.Cleanrun([[birchenv.BL_TextEditor, report_copy]], [report_copy], True)
elif DESTINATION == "FILES":
    # extract names of hits into OUTNAME.names, then keep the report itself
    # as OUTNAME.fasta
    _write_hit_names(OUTFILE, OUTNAME + ".names")
    shutil.move(OUTFILE, OUTNAME + ".fasta")
elif DESTINATION == "GDE":
    # store sequence names
    # head -4 $OUTFILE | tail +3 > namelines.$PID
    # We can no longer use tail +n because the syntax is no longer consistent
    # between Unix and Linux
    h_outfile = open(OUTFILE, "r")
    try:
        outfile_lines = h_outfile.readlines()
    finally:
        h_outfile.close()

    # The two sequence names are taken from lines 3 and 4 of the file.
    if len(outfile_lines) < 4:
        sys.stderr.write(sys.argv[0] + ": " + OUTFILE +
                         " is too short to contain an alignment\n")
        sys.exit(1)

    NAME1 = outfile_lines[2].split(" ")[0].rstrip("\r\n")
    NAME2 = outfile_lines[3].split(" ")[0].rstrip("\r\n")

    # remove leading and trailing lines from outfile and convert periods
    # to dashes, to represent gaps
    # tail -$TAILLINES $OUTFILE | grep -v Elapsed | sed "s/[.]/-/g" > seqfile.$PID
    seq_path = "seqfile." + PID_TAG
    wrp_path = "wrpfile." + PID_TAG

    h_seqfile = open(seq_path, "w")
    try:
        for line in outfile_lines[7:]:
            if "Elapsed" not in line:
                h_seqfile.write(line.replace(".", "-"))
    finally:
        h_seqfile.close()

    h_wrpfile = open(wrp_path, "w")
    try:
        _append_sequence(h_wrpfile, NAME1, 1, seq_path)
        _append_sequence(h_wrpfile, NAME2, 2, seq_path)
    finally:
        # Must be closed before readseq reads the file back below.
        h_wrpfile.close()

    # re-format as GenBank file for GDE input
    subprocess.call(["readseq", "-a", "-f2", "-o=" + GDEOUT, wrp_path])
    birchscript.Cleanrun([["gde", GDEOUT]], [GDEOUT], True)

    # Clean up: remove every file in the current directory whose extension
    # is this process's id (seqfile.<pid>, wrpfile.<pid>, ...).
    workdir = os.getcwd()
    for entry in os.listdir(workdir):
        if os.path.splitext(entry)[1] == "." + PID_TAG:
            os.remove(os.path.join(workdir, entry))