#!/usr/bin/env python """ dbsout.py - Extract hit lines or ID#'s from BLAST, GENPEPT or FASTA output Send output to files or windows. Synopsis: dbsout.py --infile [--ethreshold] --destination destination [--outfile name] --infile - output from BLAST, GENPEPT or FASTA --ethreshold - only select output for which the ETHRESHOLD is <= ethreshold --destination destination - destination: one of the following: textedit - open output files in text editor specified by the $BL_TextEditor environment variable files - write to files, using the basename specified by destination. blnfetch - BioLegato interface for nucleic acid GI numbers blpfetch - BioLegato interface for amino acid GI numbers --outfile name - basename for outputfile(s) @modified: June 3, 2016 @author: Brian Fristensky @contact: frist@cc.umanitoba.ca """ """ optparse is deprecated in favor of argparse as of Python 2.7. However, since 2.7 is not always present on many systems, at this writing, it is safer to stick with optparse for now. It should be easy to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser """ from optparse import OptionParser import os import re import shutil import sys blib = os.environ.get("BIRCHPYLIB") sys.path.append(blib) from birchlib import Birchmod from birchlib import Argument DEBUG = True PROGRAM = "dbsout.py : " USAGE = "\n\tUSAGE: dbsout.py --infile [--ethreshold float] --destination destination [--outfile filename]" BM = Birchmod(PROGRAM, USAGE) class Parameters: """ Wrapper class for command line parameters By default, ETHRESHOLD is set to 10000, so that all hits will be returned, if -e is not set at the command line """ def __init__(self): """ Initializes arguments: IFN="" ETHRESHOLD=float(10000) DESTINATION="" OFN="" PID="" TWOIDS="" Then calls read_args() to fill in their values from command line """ self.IFN = "" self.ETHRESHOLD = float(10000) self.DESTINATION = "" self.OFN = "" self.PID = str(os.getpid()) self.TWOIDS = "" self.read_args() if DEBUG : print('------------ Parameters from command line ------') print(' INFILE: ' + self.IFN) print(' ETHRESHOLD: ' + str(self.ETHRESHOLD)) print(' DESTINATION: ' + self.DESTINATION) print(' OUFILE: ' + self.OFN) print(' PID: ' + self.PID) print(' TWOIDS: ' + self.TWOIDS) def read_args(self): """ Read command line arguments into a Parameter object """ parser = OptionParser() parser.add_option("--infile", dest="infile", action="store", type="string", default="", help="Blast or Fasta output in .tsv format") parser.add_option("--outfile", dest="outfile", action="store", type="string", default="", help="basename for output files") parser.add_option("--ethreshold", dest="ethreshold", action="store", type="float", default=float(10000), help="Threshold e-value for hits to include in outfile") parser.add_option("--destination", dest="destination", action="store", type="string", default="", help="tells whether to launch output in a program or write to file") (options, args) = parser.parse_args() self.IFN = options.infile self.OFN = options.outfile self.ETHRESHOLD = options.ethreshold self.DESTINATION = options.destination PARAMSOKAY = True if (self.IFN == "") : PARAMSOKAY = False if not PARAMSOKAY : print(USAGE) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def READHITS(P, HITS): """ Read hit lines from FASTA or BLAST output file """ def isGI(LINE): """ return true if line is a BLAST-style hit eg. gi|169080|gb|AAA33662.1| disease resistance respo ( 175) 1163 295.7 5.7e-79 """ RESULT = 'false' RESULT = re.match('^[\w]{2,}\|\S*\|', LINE) return RESULT def isFASTA(LINE): """ return true if line is a FASTA-style hit eg. AF141131 - Helianthus annuus cultivar Line HA ( 439) [3] 564 146.8 9.3e-35 """ RESULT = 'false' RESULT = re.match('^[^>][\w]* - *', LINE) return RESULT def isGENPEPT(LINE): """ return true if line is a GENPEPT-style hit eg. J02593_1 SRAAFP 213874 Sea raven (Hemitripter ( 195) [f] 1369 310.6 3.5e-82 """ RESULT = 'false' RESULT = re.match('^[^>][\w]*_[1-6] [\w]* ', LINE) return RESULT def EVALUE(LINE): """ Return the E value from a hit line """ TOKENS = LINE.split(" ") # For floating point numbers <= 1e-100, BLAST # truncates the evalue to something line 'e-100' # This will cause an error when dbsout.py tries # to convert to a floating point number. # First, we try concatenating a '1' to the beginning # of the string. If that still produces an exception, # we assume the E Value is 0. The worst that can # happen is that we show an extra hit, and a bad # E value should be seen upon inspection of the output. ESTR = TOKENS[len(TOKENS) - 1] if ESTR[0] == 'e': ESTR = '1' + ESTR try: E = float(ESTR) except ValueError: E = 0 return E try: FILE = open(P.IFN, 'r') P.FILETYPE = "" except: BM.file_error(P.IFN) #Find the first hit line, in either BLAST or FASTA format LINE = FILE.readline() FOUND = 0 while not (LINE == '' or FOUND) : if isGI(LINE) : P.FILETYPE = 'BLAST' FOUND = 1 elif isFASTA(LINE) : P.FILETYPE = 'FASTA' FOUND = 1 elif isGENPEPT(LINE) : P.FILETYPE = 'GENPEPT' FOUND = 1 else: LINE = FILE.readline() # Keep reading lines until a non-hit line or end of file FINISHED = 0 if P.FILETYPE == 'BLAST' : while not (LINE == '' or FINISHED): if isGI(LINE) : LINE = LINE.strip(" ") if EVALUE(LINE) <= P.ETHRESHOLD : HITS.append(LINE) LINE = FILE.readline() else : FINISHED = 1 elif P.FILETYPE == 'FASTA' : while not (LINE == '' or FINISHED): if isFASTA(LINE) : LINE = LINE.strip(" ") if EVALUE(LINE) <= P.ETHRESHOLD : HITS.append(LINE) LINE = FILE.readline() else : FINISHED = 1 elif P.FILETYPE == 'GENPEPT' : while not (LINE == '' or FINISHED): if isGENPEPT(LINE) : LINE = LINE.strip(" ") if EVALUE(LINE) <= P.ETHRESHOLD : HITS.append(LINE) LINE = FILE.readline() else : FINISHED = 1 FILE.close() return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class Identifiers: """ For a given database, a list of IDs from hit lines """ def __init__(self): """ Initializes arguments: db="" list=[] ofn="" """ self.db = "" self.list = [] self.ofn = "" def writeFile(self, ONAME): """ @param ONAME: Name of the output file @type ONAME:str A lot of thought went into the issue of whether or not to automatically append a .csv or .tsv file extension. While that might be convenient for opening files in BioLegato (which requires a .csv or .tsv extension), it creates problems if someone wants to click on a file in a file manager. The results are unpredictable, although generally, files with that extension would open in the default spreadsheet program. This introduces lots of potential problems if the user tries to save a file from the spreadsheet. Some programs will enclose all fields in double quotes. Some may default to other formats that will render the file unreadable by BioLegat later. The final decision is to NOT automatically add an extension. """ self.ofn = ONAME + '.' + self.db OUTFILE = open(self.ofn, 'w') for J in self.list: OUTFILE.write(J + "\n") OUTFILE.close() return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def PARSEBLAST(P, HITS, ID1, ID2): """ Parse database identifiers from BLAST hit lines """ # Parse the first hit line to find out how many sets of # identifiers there are, and which databases they are from TOKENS = HITS[0].split("|") P.TWOIDS = 0 P.TWOIDS = re.match('^[\w]{2,}\|\S*\|[\w]{2,}\|\S*\|', HITS[0]) ID1.db = TOKENS[0] if P.TWOIDS: ID2.db = TOKENS[2] # Now, process the entire set of hits. Extract identifiers # from each hit line and add them to the ID list(s). for J in HITS: TOKENS = J.split("|") ID1.list.append(TOKENS[1]) if P.TWOIDS and len(TOKENS) >= 4: ID2.list.append(TOKENS[3]) return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def PARSEFASTA(HITS, ID1): """ Parse database identifiers from FASTA hit lines """ ID1.db = 'nam' for J in HITS: TOKENS = J.split(" ") ID1.list.append(TOKENS[0]) return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def PARSEGENPEPT(P, HITS, ID1, ID2): """ Parse database identifiers from GENPEPT hit lines """ # Parse the first hit line to find out how many sets of # identifiers there are TOKENS = HITS[0].split(" ") P.TWOIDS = 0 P.TWOIDS = re.match('^[^>][\w]*_[1-6] [\w]* ', HITS[0]) ID1.db = 'gp' if P.TWOIDS: ID2.db = 'gb' # Now, process the entire set of hits. Extract identifiers # from each hit line and add them to the ID list(s). for J in HITS: TOKENS = J.split(" ") ID1.list.append(TOKENS[0]) if P.TWOIDS and len(TOKENS) >= 4: ID2.list.append(TOKENS[1]) return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def RUNTEXTEDIT(OFN): """ Run the texteditor in the background and remove the temporary file when done """ COMMAND = '(nohup `choose_edit_wrapper.sh` ' + OFN + '; $RM_CMD ' + OFN + ' > /dev/null)&' # It's surprising how many issues there are with launching multiple # files in a text editor. choose_edit_wrapper.sh takes care of # these issues. #COMMAND = '($BL_TextEditor ' + OFN + '; $RM_CMD ' + OFN + ')&' os.system(COMMAND) return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def RunBioLegato(DESTINATION, IDFILE): """ Run the blnfetch or blpfetch in the background and remove the temporary file when done """ COMMAND = '(nohup ' + DESTINATION + ' ' + IDFILE + '; rm -f ' + IDFILE + ' > /dev/null)&' os.system(COMMAND) return #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ P = Parameters () #sys.exit() # Read hit lines from FASTA,BLAST or GENPEPT output file HITS = [] READHITS(P, HITS) # Parse out the database identifiers from hit lines. # BLAST and GENPEPT hit lines may have two lists of identifiers. # FASTA hit lines have one. ID1 = Identifiers () ID2 = Identifiers () if P.FILETYPE == 'BLAST': PARSEBLAST(P, HITS, ID1, ID2) elif P.FILETYPE == 'FASTA': PARSEFASTA(HITS, ID1) elif P.FILETYPE == 'GENPEPT': PARSEGENPEPT(P, HITS, ID1, ID2) # Write the output to a file, or send it to a window, as specified # in --destination if P.DESTINATION == 'textedit': TEMPOFN = P.PID + '.' + 'outfile' shutil.copy(P.IFN, TEMPOFN) RUNTEXTEDIT(TEMPOFN) ID1.writeFile(P.PID) RUNTEXTEDIT(ID1.ofn) if P.TWOIDS: ID2.writeFile(P.PID) RUNTEXTEDIT(ID2.ofn) elif P.DESTINATION == 'files': shutil.copy(P.IFN, P.OFN) ID1.writeFile(P.OFN) if P.TWOIDS: ID2.writeFile(P.OFN) elif ( P.DESTINATION in ['blnfetch','blpfetch'] ): TEMPOFN = P.PID + '.' + 'outfile' shutil.copy(P.IFN, TEMPOFN) RUNTEXTEDIT(TEMPOFN) ID1.writeFile(P.PID) RunBioLegato(P.DESTINATION,ID1.ofn) if P.TWOIDS: ID2.writeFile(P.PID) RunBioLegato(P.DESTINATION,ID2.ofn) BM.exit_success() if (BM.documentor() or "-test" in sys.argv): pass else: main()