#!/usr/bin/env python

"""
 dbsout.py - Extract hit lines or ID#'s from BLAST, GENPEPT or FASTA output
            Send output to files or windows.

 Synopsis:
  dbsout.py infile [-e ethreshold] -d destination [outfile]
       IFN - output from BLAST, GENPEPT or FASTA
       -e     - only select output for which the ETHRESHOLD is
                 <= ethreshold
       -d     - destination: one of the following:
                 textedit - open output files in text editor
                   specified by the $BL_TextEditor environment variable
                files - write to files, using the basename
                 specified by destination.
                blnfetch - BioLegato interface for nucleic acid GI numbers
                blpfetch - BioLegato interface for amino acid GI numbers

       outfile - basename for outputfile(s)

@modified: October 6, 2016
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca
"""
import sys
import os
import re
import shutil
import sys

# Directory holding the BIRCH Python library, read from the BIRCHPYLIB
# environment variable. os.environ.get() yields None when it is unset.
blib = os.environ.get("BIRCHPYLIB")
# Extend the module search path so the birchlib imports below can be resolved.
sys.path.append(blib)

from birchlib import Birchmod
from birchlib import Argument

# Program-name prefix and usage text handed to Birchmod.
PROGRAM = "dbsout.py : "
USAGE = "\n\tUSAGE: dbsout.py infile [-e ethreshold] -d destination [outfile]"

# Shared Birchmod instance. This file calls its printusage(), file_error(),
# documentor() and exit_success() methods.
BM = Birchmod(PROGRAM, USAGE)


class Parameters:
    """
    Wrapper class for command line parameters.

    By default, ETHRESHOLD is set to 10000, so that all
    hits will be returned if -e is not set at the command line.
    """
    def __init__(self):
        """
        Initializes arguments:
            IFN=""                   input file name
            ETHRESHOLD=float(10000)  largest E value to accept
            DESTINATION=""           value given with -d
            OFN=""                   output file basename
            PID=""                   filled in later by main()
            TWOIDS=""                filled in later by PARSEGENPEPT()
        Then calls read_args() to fill in their values from command line.
        """
        self.IFN = ""
        self.ETHRESHOLD = float(10000)
        self.DESTINATION = ""
        self.OFN = ""
        self.PID = ""
        self.TWOIDS = ""
        self.read_args()

    def read_args(self):
        """
        Read command line arguments into this Parameters object.

        The input file is taken from position 1 and the output file from
        the last position of sys.argv; -e and the output file are optional.
        If anything goes wrong while fetching, BM.printusage() is called.
        """
        infile = Argument("", str, BM)
        ethresh = Argument("-e", float, BM)
        dest = Argument("-d", str, BM)
        outfile = Argument("", str, BM)

        infile.set_position(1)
        outfile.set_position(len(sys.argv) - 1)
        ethresh.set_optional()
        outfile.set_optional()

        try:
            self.IFN = infile.fetch()
            self.OFN = outfile.fetch()
            self.DESTINATION = dest.fetch()

            # Only override the default when -e was actually supplied.
            threshold = ethresh.fetch()
            if threshold is not None:
                self.ETHRESHOLD = threshold
        except:
            # NOTE(review): the catch-all is kept on purpose. How birchlib's
            # Argument.fetch() signals a bad command line is not visible
            # here, so narrowing the clause could let a failure escape
            # without the usage message being printed.
            BM.printusage()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def READHITS(P, HITS):
    """
    Read hit lines from a FASTA, BLAST or GENPEPT output file.

    The file named by P.IFN is scanned for the first line that looks like
    a hit in one of the three formats; that format is recorded in
    P.FILETYPE ('BLAST', 'FASTA', 'GENPEPT', or '' when no hit line is
    found). Reading then continues until the first line that is not a hit
    of that format, or end of file. Every hit whose E value is
    <= P.ETHRESHOLD is appended to HITS with leading/trailing blanks
    removed (the trailing newline is kept).

    @param P: parameter object; IFN and ETHRESHOLD are read, FILETYPE is set
    @param HITS: list to which accepted hit lines are appended
    @type HITS: list
    """

    def isBLAST(LINE):
        """
        Return a true value if LINE is a BLAST-style hit eg.
        gi|169080|gb|AAA33662.1| disease resistance respo  ( 175) 1163 295.7 5.7e-79
        """
        return re.match(r'^[\w]{2,}\|\S*\|', LINE)

    def isFASTA(LINE):
        """
        Return a true value if LINE is a FASTA-style hit eg.
        AF141131 - Helianthus annuus cultivar Line HA  ( 439) [3]  564 146.8 9.3e-35
        """
        return re.match(r'^[^>][\w]* - *', LINE)

    def isGENPEPT(LINE):
        """
        Return a true value if LINE is a GENPEPT-style hit eg.
        J02593_1 SRAAFP 213874  Sea raven (Hemitripter ( 195) [f] 1369 310.6 3.5e-82
        """
        return re.match(r'^[^>][\w]*_[1-6] [\w]* ', LINE)

    def EVALUE(LINE):
        """
        Return the E value (last whitespace-delimited field) of a hit line.
        """
        # Split on any run of whitespace. Splitting on a single blank made
        # the last token "\n" whenever the line had trailing blanks before
        # the newline, which silently turned the E value into 0.
        TOKENS = LINE.split()
        ESTR = TOKENS[-1] if TOKENS else ''
        # For floating point numbers <= 1e-100, BLAST
        # truncates the evalue to something like 'e-100'.
        # This will cause an error when dbsout.py tries
        # to convert to a floating point number.
        # First, we try concatenating a '1' to the beginning
        # of the string. If that still produces an exception,
        # we assume the E Value is 0. The worst that can
        # happen is that we show an extra hit, and a bad
        # E value should be seen upon inspection of the output.
        if ESTR.startswith('e'):
            ESTR = '1' + ESTR
        try:
            return float(ESTR)
        except ValueError:
            return 0

    # Checked in this order for every line until one of them matches.
    DETECTORS = (('BLAST', isBLAST),
                 ('FASTA', isFASTA),
                 ('GENPEPT', isGENPEPT))

    P.FILETYPE = ""
    try:
        FILE = open(P.IFN, 'r')
    except (IOError, OSError):
        BM.file_error(P.IFN)
        # Presumably file_error() terminates the program; if it ever
        # returns there is nothing to read, so stop here.
        return

    try:
        # Find the first hit line, in any of the supported formats
        ISHIT = None
        LINE = FILE.readline()
        while LINE != '' and ISHIT is None:
            for NAME, DETECT in DETECTORS:
                if DETECT(LINE):
                    P.FILETYPE = NAME
                    ISHIT = DETECT
                    break
            else:
                LINE = FILE.readline()

        # Keep reading lines until a non-hit line or end of file
        while LINE != '' and ISHIT is not None and ISHIT(LINE):
            LINE = LINE.strip(" ")
            if EVALUE(LINE) <= P.ETHRESHOLD:
                HITS.append(LINE)
            LINE = FILE.readline()
    finally:
        FILE.close()
    return
	
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Identifiers:
    """
    For a given database, a list of IDs from hit lines.
    """
    def __init__(self):
        """
        Initializes arguments:
            db=""     database tag, set by the PARSE* functions
            list=[]   identifiers extracted from the hit lines
            ofn=""    name of the file written by writeFile()
        """
        self.db = ""
        self.list = []
        self.ofn = ""

    def writeFile(self, ONAME):
        """
        Write the identifiers, one per line, to the file ONAME + ".acc"
        and remember that file name in self.ofn.

        @param ONAME: Basename of the output file
        @type ONAME:str

        A lot of thought went into the issue of whether or not
        to automatically append a .csv or .tsv file extension.
        While that might be convenient for opening files in BioLegato
        (which requires a .csv or .tsv extension), it creates problems
        if someone wants to click on a file in a file manager. The results
        are unpredictable, although generally, files with that extension would
        open in the default spreadsheet program. This introduces lots of potential
        problems if the user tries to save a file from the spreadsheet. Some
        programs will enclose all fields in double quotes. Some may default
        to other formats that will render the file unreadable by BioLegato later.
        The final decision is to NOT automatically add a .csv or .tsv
        extension; the file gets the neutral ".acc" suffix instead.
        """
        self.ofn = ONAME + ".acc"
        # The with-block closes the file even if a write fails part way.
        with open(self.ofn, 'w') as OUTFILE:
            for J in self.list:
                # NCBI accession numbers have the form XXXXXXXX.Y, where Y is
                # a numerical version number. Before writing to output, we
                # get rid of the version field.
                OUTFILE.write(J.split(".")[0] + "\n")
        return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEBLAST(P, HITS, ID):
    """
    Parse database identifiers from BLAST hit lines.

    @param P: parameter object (not used here; kept for a uniform signature)
    @param HITS: BLAST hit lines, fields separated by '|'
    @type HITS: list
    @param ID: object whose 'list' attribute receives one accession per hit

    Nothing is appended when HITS is empty, which happens when every hit
    in the file exceeded the E value threshold.
    """
    if not HITS:
        return

    # Parse the first hit line to find the field containing the
    # ACCESSION number. At this writing, BLAST hits still include
    # GI numbers, so we need to skip those. GI numbers will be identifiers
    # containing ONLY numerals 0 - 9, and no other characters.
    # eg. gi|75170827|Q9FIG7.1  RecName: Full=Dirigent protein 2; Short=AtD...
    FIRST = HITS[0].split("|")
    ACCFIELD = 1
    # The pattern is anchored at both ends so that an identifier that merely
    # starts with a digit (eg. a PDB code such as 1ABC) is not taken for a GI.
    if len(FIRST) > 2 and re.match(r'^[0-9]+$', FIRST[1].strip()):
        ACCFIELD = 2

    # ACCFIELD is the index of ACCESSION field. Add the Accession to the list,
    # and ignore the GI number, if any.
    # GI numbers are being phased out of GenBank in September 2016. This code should
    # be compliant with that change, but backward compatible with old NCBI databases.
    # NOTE(review): for old-style lines such as gi|169080|gb|AAA33662.1| the
    # field picked here is the database tag ('gb'), as it always was - confirm
    # whether that format still needs to be supported.
    for J in HITS:
        TOKENS = J.split("|")
        if len(TOKENS) <= ACCFIELD:
            # Line has too few fields to hold an accession; nothing to add.
            continue
        # When the accession is the last '|' field the description follows it
        # after blanks; keep only the first word.
        WORDS = TOKENS[ACCFIELD].split()
        if WORDS:
            ID.list.append(WORDS[0])

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEFASTA(HITS, ID):
    """
    Parse database identifiers from FASTA hit lines.

    @param HITS: FASTA hit lines; the identifier is the text before the
                 first blank
    @type HITS: list
    @param ID: object whose 'db' attribute is set to 'nam' and whose
               'list' attribute receives one identifier per hit
    """
    ID.db = 'nam'
    ID.list.extend(J.split(" ")[0] for J in HITS)
    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def PARSEGENPEPT(P, HITS, ID):
    """
    Parse database identifiers from GENPEPT hit lines.

    @param P: parameter object; P.TWOIDS is set to the result of matching
              the first hit line against the GENPEPT hit pattern, or to 0
              when there are no hits
    @param HITS: GENPEPT hit lines; the identifier is the text before the
                 first blank
    @type HITS: list
    @param ID: object whose 'db' attribute is set to 'gp' and whose
               'list' attribute receives one identifier per hit
    """
    ID.db = 'gp'
    P.TWOIDS = 0
    if not HITS:
        # Every hit was filtered out by the E value threshold.
        return

    # Parse the first hit line to find out how many sets of
    # identifiers there are
    P.TWOIDS = re.match(r'^[^>][\w]*_[1-6] [\w]* ', HITS[0])

    # Now, process the entire set of hits. Extract the identifier
    # from each hit line and add it to the ID list.
    ID.list.extend(J.split(" ")[0] for J in HITS)

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNTEXTEDIT(OFN):
    """
    Open OFN in the text editor as a background job and delete
    the file with $RM_CMD once the editor command has finished.

    @param OFN: name of the (temporary) file to show
    @type OFN: str
    """
    # It's surprising how many issues there are with launching multiple
    # files in a text editor. choose_edit_wrapper.sh takes care of
    # these issues.
    PIECES = [
        '(nohup `choose_edit_wrapper.sh` ',
        OFN,
        '; $RM_CMD ',
        OFN,
        ' > /dev/null)&',
    ]
    os.system("".join(PIECES))
    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RunBioLegato(DESTINATION, IDFILE):
    """
    Start the program named by DESTINATION (blnfetch or blpfetch) on
    IDFILE as a background job and delete IDFILE once it has finished.

    @param DESTINATION: command to run
    @type DESTINATION: str
    @param IDFILE: name of the temporary file holding the identifiers
    @type IDFILE: str
    """
    PIECES = [
        '(nohup ',
        DESTINATION,
        ' ',
        IDFILE,
        '; rm -f ',
        IDFILE,
        ' > /dev/null)&',
    ]
    os.system("".join(PIECES))

    return

           
#======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.

    Reads the command line, collects the hit lines from the input file,
    extracts their identifiers and delivers the results as requested
    by -d destination.
    """

    P = Parameters()

    # Read hit lines from FASTA, BLAST or GENPEPT output file
    HITS = []
    READHITS(P, HITS)

    # Parse out the database identifiers from hit lines.
    ID = Identifiers()

    P.PID = str(os.getpid())
    # HITS can be empty even though a file type was recognised, namely when
    # every hit exceeded the E value threshold. The BLAST and GENPEPT parsers
    # look at the first hit, so they are only called when there is one.
    if P.FILETYPE == 'BLAST' and HITS:
        PARSEBLAST(P, HITS, ID)
    elif P.FILETYPE == 'FASTA':
        PARSEFASTA(HITS, ID)
    elif P.FILETYPE == 'GENPEPT' and HITS:
        PARSEGENPEPT(P, HITS, ID)

    # Write the output to a file, or send it to a window, as specified
    # in -d destination

    if P.DESTINATION == 'files':
        shutil.copy(P.IFN, P.OFN)
        ID.writeFile(P.OFN)

    elif P.DESTINATION in ('textedit', 'blnfetch', 'blpfetch'):
        # All three show a temporary copy of the input in the text editor;
        # the process id keeps the temporary names unique per run.
        TEMPOFN = P.PID + '.' + 'outfile'
        shutil.copy(P.IFN, TEMPOFN)
        RUNTEXTEDIT(TEMPOFN)
        ID.writeFile(P.PID)
        if P.DESTINATION == 'textedit':
            RUNTEXTEDIT(ID.ofn)
        else:
            RunBioLegato(P.DESTINATION, ID.ofn)

    BM.exit_success()

# Run the program unless BM.documentor() returns a true value or
# "-test" was given on the command line.
if not (BM.documentor() or "-test" in sys.argv):
    main()