#!/usr/bin/env python
"""
dbsout.py - Extract hit lines or ID#'s from BLAST, GENPEPT or FASTA output.
            Send output to files or windows.

Synopsis: dbsout.py infile [-e ethreshold] -d destination [outfile]

    infile  - (IFN) output from BLAST, GENPEPT or FASTA
    -e      - only select output for which the E value is <= ethreshold
    -d      - destination: one of the following:
                  textedit - open output files in the text editor specified
                             by the $BL_TextEditor environment variable
                  files    - write to files, using the basename specified
                             by outfile
                  blnfetch - BioLegato interface for nucleic acid GI numbers
                  blpfetch - BioLegato interface for amino acid GI numbers
    outfile - basename for output file(s)

@modified: October 6, 2016
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca
"""

import os
import re
import shutil
import sys

# birchlib lives in the directory named by $BIRCHPYLIB. Only extend the
# module search path when the variable is actually set, so that None is
# never pushed onto sys.path.
blib = os.environ.get("BIRCHPYLIB")
if blib:
    sys.path.append(blib)

from birchlib import Birchmod
from birchlib import Argument

PROGRAM = "dbsout.py : "
USAGE = "\n\tUSAGE: dbsout.py infile [-e ethreshold] -d destination [outfile]"
BM = Birchmod(PROGRAM, USAGE)

# Hit-line recognisers, tried in this order when looking for the first hit.
#
# BLAST-style hit, eg.
#   gi|169080|gb|AAA33662.1| disease resistance respo ( 175) 1163 295.7 5.7e-79
# FASTA-style hit, eg.
#   AF141131 - Helianthus annuus cultivar Line HA ( 439) [3] 564 146.8 9.3e-35
# GENPEPT-style hit, eg.
#   J02593_1 SRAAFP 213874 Sea raven (Hemitripter ( 195) [f] 1369 310.6 3.5e-82
_GENPEPT_HIT = re.compile(r'^[^>][\w]*_[1-6] [\w]* ')
_HIT_PATTERNS = (
    ('BLAST', re.compile(r'^[\w]{2,}\|\S*\|')),
    ('FASTA', re.compile(r'^[^>][\w]* - *')),
    ('GENPEPT', _GENPEPT_HIT),
)

# A GI number consists ONLY of the numerals 0 - 9.
_GI_NUMBER = re.compile(r'[0-9]+$')


class Parameters:
    """
    Wrapper class for command line parameters.

    By default, ETHRESHOLD is set to 10000, so that all hits will be
    returned if -e is not set at the command line.
    """

    def __init__(self):
        """
        Initializes arguments:
            IFN=""
            ETHRESHOLD=float(10000)
            DESTINATION=""
            OFN=""
            PID=""
            TWOIDS=""
            FILETYPE=""
        Then calls read_args() to fill in their values from the command line.
        """
        self.IFN = ""
        self.ETHRESHOLD = float(10000)
        self.DESTINATION = ""
        self.OFN = ""
        self.PID = ""
        self.TWOIDS = ""
        # Set by READHITS to 'BLAST', 'FASTA' or 'GENPEPT'; stays empty
        # when no hit line is found.
        self.FILETYPE = ""
        self.read_args()

    def read_args(self):
        """
        Read command line arguments into this Parameters object.

        Calls BM.printusage() if any argument cannot be fetched.
        """
        infile = Argument("", str, BM)
        ethresh = Argument("-e", float, BM)
        dest = Argument("-d", str, BM)
        outfile = Argument("", str, BM)

        infile.set_position(1)
        outfile.set_position(len(sys.argv) - 1)
        ethresh.set_optional()
        outfile.set_optional()

        try:
            self.IFN = infile.fetch()
            self.OFN = outfile.fetch()
            self.DESTINATION = dest.fetch()
            threshold = ethresh.fetch()
            # -e is optional; keep the permissive default when it is absent.
            if threshold is not None:
                self.ETHRESHOLD = threshold
        except Exception:
            BM.printusage()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def _hit_format(line):
    """
    Return (name, pattern) for the first hit format that matches line,
    or None if line is not a hit line in any known format.
    """
    for name, pattern in _HIT_PATTERNS:
        if pattern.match(line):
            return name, pattern
    return None


def _evalue(line):
    """
    Return the E value from a hit line (the last whitespace-delimited field).

    For floating point numbers <= 1e-100, BLAST truncates the E value to
    something like 'e-100', which cannot be converted to a float directly.
    First, we try concatenating a '1' to the beginning of the string. If
    that still fails, we assume the E value is 0. The worst that can happen
    is that we show an extra hit, and a bad E value should be seen upon
    inspection of the output.
    """
    # Split on any whitespace so that trailing blanks, tabs or the newline
    # never become the "last field".
    tokens = line.split()
    if not tokens:
        return 0
    estr = tokens[-1]
    if estr.startswith('e'):
        estr = '1' + estr
    try:
        return float(estr)
    except ValueError:
        return 0


def READHITS(P, HITS):
    """
    Read hit lines from a FASTA, BLAST or GENPEPT output file.

    @param P: parameters; P.IFN is read, P.ETHRESHOLD is the cutoff and
              P.FILETYPE is set to 'BLAST', 'FASTA', 'GENPEPT' or ''
    @type P: Parameters
    @param HITS: list to which hit lines with E value <= P.ETHRESHOLD
                 are appended
    @type HITS: list
    """
    P.FILETYPE = ""
    try:
        infile = open(P.IFN, 'r')
    except (IOError, OSError):
        BM.file_error(P.IFN)
        return

    with infile:
        # Find the first hit line, in any of the known formats.
        line = infile.readline()
        found = None
        while line != '':
            found = _hit_format(line)
            if found is not None:
                break
            line = infile.readline()
        if found is None:
            return

        P.FILETYPE, pattern = found

        # Keep reading lines until a non-hit line or end of file. Only
        # the format of the first hit is accepted from here on.
        while line != '' and pattern.match(line):
            line = line.strip(" ")
            if _evalue(line) <= P.ETHRESHOLD:
                HITS.append(line)
            line = infile.readline()
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Identifiers:
    """
    For a given database, a list of IDs from hit lines.
    """

    def __init__(self):
        """
        Initializes arguments:
            db=""
            list=[]
            ofn=""
        """
        self.db = ""
        self.list = []
        self.ofn = ""

    def writeFile(self, ONAME):
        """
        Write the identifiers, one per line, to ONAME + ".acc" and record
        that file name in self.ofn.

        @param ONAME: Name of the output file
        @type ONAME: str

        A lot of thought went into the issue of whether or not to
        automatically append a .csv or .tsv file extension. While that might
        be convenient for opening files in BioLegato (which requires a .csv
        or .tsv extension), it creates problems if someone wants to click on
        a file in a file manager. The results are unpredictable, although
        generally, files with that extension would open in the default
        spreadsheet program. This introduces lots of potential problems if
        the user tries to save a file from the spreadsheet. Some programs
        will enclose all fields in double quotes. Some may default to other
        formats that will render the file unreadable by BioLegato later.

        The final decision is to NOT automatically add such an extension.
        """
        self.ofn = ONAME + ".acc"
        with open(self.ofn, 'w') as outfile:
            for identifier in self.list:
                # NCBI accession numbers have the form XXXXXXXX.Y, where Y
                # is a numerical version number. Before writing to output,
                # we get rid of the version field.
                outfile.write(identifier.split(".")[0] + "\n")
        return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEBLAST(P, HITS, ID):
    """
    Parse database identifiers from BLAST hit lines.

    @param P: parameters (not used here)
    @type P: Parameters
    @param HITS: BLAST hit lines
    @type HITS: list
    @param ID: object whose list receives one identifier per hit
    @type ID: Identifiers
    """
    # Every hit may have been filtered out by the E value threshold.
    if not HITS:
        return

    # Parse the first hit line to find the field containing the
    # ACCESSION number. At this writing, BLAST hits still include
    # GI numbers, so we need to skip those. GI numbers will be identifiers
    # containing ONLY numerals 0 - 9, and no other characters.
    # eg. gi|75170827|Q9FIG7.1 RecName: Full=Dirigent protein 2; Short=AtD...
    # NOTE(review): for four-field hits such as gi|169080|gb|AAA33662.1|
    # field 2 is the database tag ('gb'), not the accession - confirm
    # whether that older layout still needs to be supported.
    first_fields = HITS[0].split("|")
    accfield = 1
    if _GI_NUMBER.match(first_fields[1].strip()):
        accfield = 2

    # accfield is the index of the ACCESSION field. Add the accession to
    # the list, and ignore the GI number, if any.
    # GI numbers are being phased out of GenBank in September 2016. This
    # code should be compliant with that change, but backward compatible
    # with old NCBI databases.
    for hit in HITS:
        ID.list.append(hit.split("|")[accfield].strip())
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEFASTA(HITS, ID):
    """
    Parse database identifiers from FASTA hit lines.

    The identifier is the first space-delimited field of each hit line.
    """
    ID.db = 'nam'
    ID.list.extend(hit.split(" ")[0] for hit in HITS)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEGENPEPT(P, HITS, ID):
    """
    Parse database identifiers from GENPEPT hit lines.

    P.TWOIDS is set to the result of matching the first hit line against
    the GENPEPT hit pattern (None when there are no hits).
    """
    # Parse the first hit line to find out how many sets of
    # identifiers there are.
    P.TWOIDS = _GENPEPT_HIT.match(HITS[0]) if HITS else None
    ID.db = 'gp'

    # Now, process the entire set of hits. Extract the identifier
    # from each hit line and add it to the ID list.
    ID.list.extend(hit.split(" ")[0] for hit in HITS)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNTEXTEDIT(OFN):
    """
    Run the text editor in the background and remove the temporary file
    when done.
    """
    # It's surprising how many issues there are with launching multiple
    # files in a text editor. choose_edit_wrapper.sh takes care of
    # these issues, which is why $BL_TextEditor is not invoked directly.
    COMMAND = '(nohup `choose_edit_wrapper.sh` ' + OFN + '; $RM_CMD ' + OFN + ' > /dev/null)&'
    os.system(COMMAND)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RunBioLegato(DESTINATION, IDFILE):
    """
    Run blnfetch or blpfetch in the background and remove the temporary
    file when done.
    """
    COMMAND = '(nohup ' + DESTINATION + ' ' + IDFILE + '; rm -f ' + IDFILE + ' > /dev/null)&'
    os.system(COMMAND)
    return


#======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.
    """
    P = Parameters()

    # Read hit lines from FASTA, BLAST or GENPEPT output file.
    HITS = []
    READHITS(P, HITS)

    # Parse out the database identifiers from hit lines.
    ID = Identifiers()
    P.PID = str(os.getpid())
    if P.FILETYPE == 'BLAST':
        PARSEBLAST(P, HITS, ID)
    elif P.FILETYPE == 'FASTA':
        PARSEFASTA(HITS, ID)
    elif P.FILETYPE == 'GENPEPT':
        PARSEGENPEPT(P, HITS, ID)

    # Write the output to a file, or send it to a window, as specified
    # in -d destination.
    if P.DESTINATION == 'files':
        shutil.copy(P.IFN, P.OFN)
        ID.writeFile(P.OFN)
    elif P.DESTINATION in ['textedit', 'blnfetch', 'blpfetch']:
        # All window destinations show a temporary copy of the raw output
        # in the text editor; they differ only in where the ID list goes.
        TEMPOFN = P.PID + '.' + 'outfile'
        shutil.copy(P.IFN, TEMPOFN)
        RUNTEXTEDIT(TEMPOFN)
        ID.writeFile(P.PID)
        if P.DESTINATION == 'textedit':
            RUNTEXTEDIT(ID.ofn)
        else:
            RunBioLegato(P.DESTINATION, ID.ofn)

    BM.exit_success()


# Do nothing in documentation mode or when invoked with -test.
if not (BM.documentor() or "-test" in sys.argv):
    main()