#!/usr/bin/env python
"""Post-process the output file of a FASTA-family program.

Version 5/23/2008

Command-line arguments, as read by this script:
    sys.argv[2]  integer; the value 3 sends the result to GDE
    sys.argv[3]  path of the program output file to process
    sys.argv[4]  optional base name for saved files; when it is empty or
                 absent the output is opened in the text editor instead

Destinations:
    GDE       the two aligned sequences are re-wrapped with readseq and
              opened in gde
    TEXTEDIT  the output and the list of hit names are opened in
              birchenv.BL_TextEditor
    FILES     the hit names are saved as <outname>.names and the output is
              moved to <outname>.fasta
"""

import os
import os.path
import re
import shutil
import subprocess
import sys

import birchenv
import birchscript


def _write_hit_names(report_path, names_path):
    """Write the name of every hit line in report_path to names_path.

    A hit line is one that contains " -" and starts with neither ">>" nor
    "#".  The name is the first space-delimited field of the line.  One name
    is written per line, matching the shell pipeline this replaces:

        egrep -e '^* - *' report | egrep -v -e '^>>' | egrep -v -e '^#' |
            cut -d" " -f1 > names
    """
    with open(report_path, "r") as report:
        with open(names_path, "w") as names:
            for line in report:
                if " -" not in line or line.startswith((">>", "#")):
                    continue
                names.write(line.split(" ")[0] + "\n")


if len(sys.argv) < 4:
    sys.stderr.write(
        "usage: " + sys.argv[0] + " <arg1> <format number> <outfile> [outname]\n"
    )
    sys.exit(1)

# The process id makes the temporary file names unique.  It is kept as a
# string because it is only ever used to build file names.
PID = str(os.getpid())

# set global variables
OUTFILE = sys.argv[3]
# The fourth argument is optional; treat a missing one like an empty one.
OUTNAME = sys.argv[4] if len(sys.argv) > 4 else ""

if int(sys.argv[2]) == 3:
    DESTINATION = "GDE"
    GDEOUT = PID + ".gen"
elif OUTNAME == "":
    DESTINATION = "TEXTEDIT"
else:
    DESTINATION = "FILES"

# Parse out the accession numbers or names of hits
# and write to name.$PID.
# Normally, Pearson/FASTA format
# files contain a simple name in the form ">name". FASTA
# programs put the name at the beginning of lines
# of the form "name - rest of the line"
# GenPept files are a special case. They have complex
# name lines, with various ID's. The GenBank accession
# number has to be parsed out of these lines. The lines look like
# >gi|5881102|gb|AAD55053.1|AF170915_1 (AF170915) green fluorescent protein [Expression vector FRMwg]
# FASTY makes matters worse by truncating the line to make room for
# statistics, which sometimes cuts off the accession # that's in
# parentheses. So we have to be able to deal with two types of
# lines, one with the ACCESSION number in parentheses, the other
# in the form "|AF170915_1"

if DESTINATION == "TEXTEDIT":
    # Special code for text editors used by GDE and scripts called by GDE
    # Nedit crashes in some Linux systems due to libraries set in BIRCHLIBS.
    # nedit_wrapper unsets LD_LIBRARY_PATH before calling nedit.
    # gedit opens all files in a single window. gedit_wrapper.sh forces
    # gedit to open each file in different window.
    # choose_edit_wrapper.sh returns the name of the wrapper to use
    # for each editor, or just returns BL_TextEditor if there is no
    # wrapper.
    report_file = "outfile." + PID
    names_file = "name." + PID
    shutil.move(OUTFILE, report_file)

    # extract names of hits into a file and open the file in text editor
    _write_hit_names(report_file, names_file)
    if os.path.getsize(names_file) > 0:
        birchscript.Cleanrun(
            [[birchenv.BL_TextEditor, names_file, "-geometry", "15x40"]],
            [names_file],
            True,
        )
    else:
        # No hits were found, so there is nothing worth showing.
        os.remove(names_file)

    # read outfile into texteditor
    birchscript.Cleanrun(
        [[birchenv.BL_TextEditor, report_file]], [report_file], True
    )

elif DESTINATION == "FILES":
    # extract names of hits into a file
    _write_hit_names(OUTFILE, OUTNAME + ".names")
    shutil.move(OUTFILE, OUTNAME + ".fasta")

elif DESTINATION == "GDE":
    # store sequence names
    # head -4 $OUTFILE | tail +3 > namelines.$PID
    # We can no longer use tail +n because the syntax is no longer consistent
    # between Unix and Linux
    with open(OUTFILE, "r") as h_outfile:
        outfile_lines = h_outfile.readlines()

    if len(outfile_lines) < 4:
        sys.exit(OUTFILE + ": expected sequence names on lines 3 and 4")

    # The names are the first space-delimited field of lines 3 and 4.
    NAME1 = outfile_lines[2].split(" ")[0].rstrip("\r\n")
    NAME2 = outfile_lines[3].split(" ")[0].rstrip("\r\n")

    # remove leading and trailing lines from outfile and convert periods
    # to dashes, to represent gaps
    # TAILLINES = ( len(outfile_lines) - 7 )
    # tail -$TAILLINES $OUTFILE | grep -v Elapsed | sed "s/[.]/-/g" > seqfile.$PID
    seq_file = "seqfile." + PID
    wrp_file = "wrpfile." + PID
    with open(seq_file, "w") as h_seqfile:
        for line in outfile_lines[7:]:
            if "Elapsed" not in line:
                h_seqfile.write(line.replace(".", "-"))

    with open(wrp_file, "w") as h_wrpfile:
        for seq_name, seq_index in ((NAME1, "-i1"), (NAME2, "-i2")):
            h_wrpfile.write(">" + seq_name + "\n")
            # readseq writes straight to the underlying file descriptor, so
            # the header must leave Python's buffer first to stay in order.
            h_wrpfile.flush()
            subprocess.call(
                ["readseq", seq_index, "-fPlain", "-pipe", seq_file],
                stdout=h_wrpfile,
            )

    # re-format as GenBank file for GDE input
    # (wrp_file is closed above, so readseq sees its complete contents)
    subprocess.call(["readseq", "-a", "-f2", "-o=" + GDEOUT, wrp_file])
    birchscript.Cleanrun([["gde", GDEOUT]], [GDEOUT], True)

# Clean up.
# Remove every file in the current directory whose extension is this
# process's id.
# NOTE(review): the files handed to birchscript.Cleanrun in the TEXTEDIT
# branch also match this pattern.  If Cleanrun returns before the editor has
# opened them, this loop may delete them too early -- confirm against
# birchscript.
work_dir = os.getcwd()
for entry in os.listdir(work_dir):
    if os.path.splitext(entry)[1] == "." + PID:
        os.remove(os.path.join(work_dir, entry))