#!/usr/local/bin/python
"""
Sept. 1, 2010, Dr. Brian Fristensky, University of Manitoba

dbsout.py - Extract hit lines or ID#'s from BLAST or FASTA output
Send output to files or windows.

Synopsis: dbsout.py infile [-e ethreshold] -d destination [outfile]

    infile  - output from BLAST or FASTA
    -e      - only select hits whose E value is <= ethreshold
    -d      - destination: one of the following:
              textedit - open the output files in a text editor
                         (launched through choose_edit_wrapper.sh)
              files    - write to files, using the basename given
                         by outfile
              gde      - send hit lines to a GDE window
                         (not yet implemented)
    outfile - basename for outputfile(s)

@modified: May 26 2010
@author: Dale Hamel
@contact: umhameld@cc.umanitoba.ca
"""
import os
import re
import shutil
import sys

# birchlib lives in the directory named by $BIRCHPYLIB. Only extend the
# module search path when the variable is actually set, so that None is
# never pushed onto sys.path.
blib = os.environ.get("BIRCHPYLIB")
if blib:
    sys.path.append(blib)

from birchlib import Birchmod
from birchlib import Argument

PROGRAM = "dbsout.py : "
USAGE = "\n\tUSAGE: dbsout.py infile [-e ethreshold] -d destination [outfile]"
BM = Birchmod(PROGRAM, USAGE)

# Hit-line patterns, compiled once. Raw strings keep the regex escapes
# (\w, \S, \|) from being read as (invalid) string escapes.
_BLAST_HIT_RE = re.compile(r'^[\w]{2,}\|\S*\|')
_FASTA_HIT_RE = re.compile(r'^[^>][\w]* - *')
_BLAST_TWOIDS_RE = re.compile(r'^[\w]{2,}\|\S*\|[\w]{2,}\|\S*\|')


class Parameters:
    """
    Wrapper class for command line parameters.

    By default, ETHRESHOLD is set to 10000, so that all hits will be
    returned if -e is not set at the command line.
    """

    def __init__(self):
        """
        Initializes arguments:
            IFN=""
            ETHRESHOLD=float(10000)
            DESTINATION=""
            OFN=""
            PID=""
            TWOIDS=""
            FILETYPE=""
        Then calls read_args() to fill in their values from command line
        """
        self.IFN = ""
        self.ETHRESHOLD = float(10000)
        self.DESTINATION = ""
        self.OFN = ""
        self.PID = ""
        self.TWOIDS = ""
        # Set by READHITS to 'BLAST' or 'FASTA'; initialised here so that
        # the attribute always exists.
        self.FILETYPE = ""
        self.read_args()

    def read_args(self):
        """
        Read command line arguments into this Parameters object.

        The input file is positional argument 1 and the optional output
        basename is the last argument. -d is required, -e is optional.
        If fetching an argument raises, the usage message is printed
        through BM.printusage().
        """
        infile = Argument("", str, BM)
        ethresh = Argument("-e", float, BM)
        dest = Argument("-d", str, BM)
        outfile = Argument("", str, BM)

        infile.set_position(1)
        outfile.set_position(len(sys.argv) - 1)
        ethresh.set_optional()
        outfile.set_optional()

        try:
            self.IFN = infile.fetch()
            self.OFN = outfile.fetch()
            self.DESTINATION = dest.fetch()
            threshold = ethresh.fetch()
            # Keep the default threshold when -e was not supplied.
            if threshold is not None:
                self.ETHRESHOLD = threshold
        except Exception:
            BM.printusage()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def READHITS(P, HITS):
    """
    Read hit lines from FASTA or BLAST output file.

    Scans P.IFN for the first BLAST- or FASTA-style hit line, records the
    format in P.FILETYPE ('BLAST', 'FASTA', or '' when no hit line is
    found), then reads the contiguous run of hit lines that follows.
    Lines whose E value is <= P.ETHRESHOLD are appended to HITS.

    @param P: command line parameters; P.FILETYPE is set here
    @type P: Parameters
    @param HITS: list that receives the selected hit lines (modified in place)
    @type HITS: list
    """

    def isGI(LINE):
        """
        return true if line is a BLAST-style hit
        eg.
        gi|169080|gb|AAA33662.1| disease resistance respo  ( 175) 1163 295.7 5.7e-79
        """
        return _BLAST_HIT_RE.match(LINE)

    def isFASTA(LINE):
        """
        return true if line is a FASTA-style hit
        eg.
        AF141131 - Helianthus annuus cultivar Line HA  ( 439) [3]  564 146.8 9.3e-35
        """
        return _FASTA_HIT_RE.match(LINE)

    def EVALUE(LINE):
        """
        Return the E value from a hit line (its last whitespace-separated
        field).
        """
        # Split on any whitespace so that trailing blanks or the newline
        # never end up as the "last token".
        TOKENS = LINE.split()
        if not TOKENS:
            return 0

        # For floating point numbers <= 1e-100, BLAST
        # truncates the evalue to something like 'e-100'.
        # This will cause an error when dbsout.py tries
        # to convert to a floating point number.
        # First, we try concatenating a '1' to the beginning
        # of the string. If that still produces an exception,
        # we assume the E Value is 0. The worst that can
        # happen is that we show an extra hit, and a bad
        # E value should be seen upon inspection of the output.
        ESTR = TOKENS[-1]
        if ESTR[0] == 'e':
            ESTR = '1' + ESTR
        try:
            E = float(ESTR)
        except ValueError:
            E = 0
        return E

    P.FILETYPE = ""
    try:
        FILE = open(P.IFN, 'r')
    except (IOError, OSError):
        BM.file_error(P.IFN)
        # NOTE(review): BM.file_error presumably terminates the program;
        # returning here just avoids using an unopened file if it does not.
        return

    with FILE:
        ISHIT = None  # hit-line predicate, chosen at the first hit line
        for LINE in FILE:
            if ISHIT is None:
                # Still looking for the first hit line, in either format.
                if isGI(LINE):
                    P.FILETYPE = 'BLAST'
                    ISHIT = isGI
                elif isFASTA(LINE):
                    P.FILETYPE = 'FASTA'
                    ISHIT = isFASTA
                else:
                    continue
            elif not ISHIT(LINE):
                # The block of hit lines ends at the first non-hit line.
                break

            LINE = LINE.strip(" ")
            if EVALUE(LINE) <= P.ETHRESHOLD:
                HITS.append(LINE)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Identifiers:
    """
    For a given database, a list of IDs from hit lines
    """

    def __init__(self):
        """
        Initializes arguments:
            db=""
            list=[]
            ofn=""
        """
        self.db = ""
        self.list = []
        self.ofn = ""

    def writeFile(self, ONAME):
        """
        Write the identifiers, one per line, to the file ONAME.db and
        remember that file name in self.ofn.

        @param ONAME: Name of the output file
        @type ONAME: str
        """
        self.ofn = ONAME + '.' + self.db
        with open(self.ofn, 'w') as OUTFILE:
            for J in self.list:
                OUTFILE.write(J + "\n")
        return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEBLAST(P, HITS, ID1, ID2):
    """
    Parse database identifiers from BLAST hit lines.

    Sets P.TWOIDS (true when hit lines carry two db|id pairs), fills in
    ID1.db / ID1.list and, when there are two identifiers, ID2.db /
    ID2.list.

    @param P: command line parameters; P.TWOIDS is set here
    @type P: Parameters
    @param HITS: BLAST hit lines, as collected by READHITS
    @type HITS: list
    @param ID1: receives the first database name and identifiers
    @type ID1: Identifiers
    @param ID2: receives the second database name and identifiers, if any
    @type ID2: Identifiers
    """
    # All hits may have been filtered out by the E value threshold;
    # there is then nothing to parse.
    if not HITS:
        P.TWOIDS = None
        return

    # Parse the first hit line to find out how many sets of
    # identifiers there are, and which databases they are from
    TOKENS = HITS[0].split("|")
    P.TWOIDS = _BLAST_TWOIDS_RE.match(HITS[0])
    ID1.db = TOKENS[0]
    if P.TWOIDS:
        ID2.db = TOKENS[2]

    # Now, process the entire set of hits. Extract identifiers
    # from each hit line and add them to the ID list(s).
    for J in HITS:
        TOKENS = J.split("|")
        ID1.list.append(TOKENS[1])
        # Not every line is guaranteed to carry the second identifier.
        if P.TWOIDS and len(TOKENS) >= 4:
            ID2.list.append(TOKENS[3])
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEFASTA(HITS, ID1):
    """
    Parse database identifiers from FASTA hit lines.

    The identifier is the first space-delimited field of each hit line;
    the database name is always 'nam'.

    @param HITS: FASTA hit lines, as collected by READHITS
    @type HITS: list
    @param ID1: receives the database name and identifiers
    @type ID1: Identifiers
    """
    ID1.db = 'nam'
    for J in HITS:
        ID1.list.append(J.split(" ")[0])
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNTEXTEDIT(OFN):
    """
    Run the texteditor in the background and remove the
    temporary file when done

    @param OFN: name of the file to open (deleted afterwards)
    @type OFN: str
    """
    # It's surprising how many issues there are with launching multiple
    # files in a text editor. choose_edit_wrapper.sh takes care of
    # these issues.
    COMMAND = '(`choose_edit_wrapper.sh` ' + OFN + '; $RM_CMD ' + OFN + ')&'
    os.system(COMMAND)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNDGDE(P, ID1, ID2):
    """
    Run the dgde in the background and remove the
    temporary file when done.

    The identifier file(s) previously written by Identifiers.writeFile
    are converted with list2flat.py into a single flat file, which is
    then opened in dgde.

    @param P: command line parameters (P.PID and P.TWOIDS are read)
    @type P: Parameters
    @param ID1: first identifier list; ID1.ofn must already be written
    @type ID1: Identifiers
    @param ID2: second identifier list, used only when P.TWOIDS is true
    @type ID2: Identifiers
    """
    FLATFILE = 'dbsout.' + P.PID + '.flat'
    COMMAND = 'python $BIRCH/script/list2flat.py ' + ID1.ofn + ' ' + FLATFILE
    os.system(COMMAND)

    if P.TWOIDS:
        # Convert the second list separately, then append it to the
        # flat file.
        TEMPFILE = P.PID + '.temp'
        COMMAND = 'python $BIRCH/script/list2flat.py ' + ID2.ofn + ' ' + TEMPFILE
        os.system(COMMAND)
        COMMAND = 'cat ' + TEMPFILE + '>> ' + FLATFILE
        os.system(COMMAND)
        os.remove(TEMPFILE)

    COMMAND = '(dgde ' + FLATFILE + '; $RM_CMD ' + FLATFILE + ')&'
    os.system(COMMAND)
    return


#======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.
    """
    P = Parameters()

    # Read hit lines from FASTA or BLAST output file
    HITS = []
    READHITS(P, HITS)

    # Parse out the database identifiers from hit lines.
    # BLAST hit lines may have two lists of identifiers.
    # FASTA hit lines have one.
    ID1 = Identifiers()
    ID2 = Identifiers()
    P.PID = str(os.getpid())
    if P.FILETYPE == 'BLAST':
        PARSEBLAST(P, HITS, ID1, ID2)
    elif P.FILETYPE == 'FASTA':
        PARSEFASTA(HITS, ID1)

    # Write the output to a file, or send it to a window, as specified
    # in -d destination
    if P.DESTINATION == 'textedit':
        # Work on copies named after the process id; the editor wrapper
        # deletes each file once it has been closed.
        TEMPOFN = P.PID + '.' + 'outfile'
        shutil.copy(P.IFN, TEMPOFN)
        RUNTEXTEDIT(TEMPOFN)
        ID1.writeFile(P.PID)
        RUNTEXTEDIT(ID1.ofn)
        if P.TWOIDS:
            ID2.writeFile(P.PID)
            RUNTEXTEDIT(ID2.ofn)
        RUNDGDE(P, ID1, ID2)
    elif P.DESTINATION == 'files':
        shutil.copy(P.IFN, P.OFN)
        ID1.writeFile(P.OFN)
        if P.TWOIDS:
            ID2.writeFile(P.OFN)
    elif P.DESTINATION == 'gde':
        # Single-argument call form prints identically under Python 2 and 3.
        print('dbsout.py: !!! Output to GDE not yet implemented')

    BM.exit_success()


# Do nothing when run in documentation mode or with -test.
if not (BM.documentor() or "-test" in sys.argv):
    main()