#!/usr/bin/env python3

import os
#import os.path
import subprocess
import sys
import re
import shutil
from Bio import Entrez

#optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing,
# it is safer to stick with optparse for now. It should be easy
# to change later, since the syntax is very similar between argparse and optparse.
from optparse import OptionParser

'''
taxfetch.py - Given a set of GenBank LOCUS or ACCESSION numbers , create a file containing
the corresponding NCBI taxonomy entries. If the input file is a csv/tsv file, only the first
column is read. Lines beginning with '#' are ignored as comments

Synopsis: taxfetch.py --infile infile --db database --tablefile tablefilename [--sep separator]

  --infile containing LOCUS or ACCESSION numbers
  --db - NCBI database as defined for EUtilities
  --tablefile for use by forrester decorator program
  --sep - separator character to use when input is a csv file eg tab, comma
  
@modified: July 6, 2020
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca  
'''


blib = os.environ.get("BIRCHPYLIB")
sys.path.append(blib)

from birchlib import Birchmod
from birchlib import Argument

PROGRAM = "taxfetch.py : "
USAGE = "\n\tUSAGE: taxfetch.py --infile infile --db database [--tablefile tablefile] [--sep separator]"

DEBUG = True
if DEBUG :
    print('Debugging mode on')

BM = Birchmod(PROGRAM, USAGE)

# To use an Entrez API key and email address. These variables can be set once for the Entrez class,
# and will be used for all EUtil calls.
Entrez.api_key = os.getenv("NCBI_ENTREZ_KEY", default=None)
Entrez.email = os.getenv("BL_EMAIL", default=None)
if DEBUG :
    print("Entrez.api_key: " + str(Entrez.api_key))
    print('Entrez.email: ' + str(Entrez.email))

class Parameters:
    """
      	Wrapper class for command line parameters
      	"""
    def __init__(self):
        """
     	  Initializes arguments:
     		IFN=""
            DATABASE = "nuccore"
            SEPARATOR = "\t"
     		TFN=""
     		PID=""

     	  Then calls read_args() to fill in their values from command line
          """
        self.IFN = ""
        self.DATABASE = "taxonomy"
        self.TFN = ""
        self.SEPARATOR = ","
        self.PID = str(os.getpid())
        self.read_args()

    def read_args(self):
        """
        	Read command line arguments into a Parameter object
        	"""

        parser = OptionParser()
        parser.add_option("--infile", dest="ifn", action="store", type="string", default="",
                          help="file of LOCUS or ACCESSION numbers")
        parser.add_option("--db", dest="database", action="store", type="string", default="nuccore",
                          help="database to search")
        parser.add_option("--tablefile", dest="tfn", action="store", type="string", default="",
                          help="output in table format for forrester decorator program")
        parser.add_option("--sep", dest="seperator", action="store", type="string", default="\t",
                          help="filter for query")

        (options, args) = parser.parse_args() 

        self.IFN = options.ifn
        self.DATABASE = options.database
        self.TFN = options.tfn
        self.SEPERATOR = options.seperator

        if DEBUG :
            print('------------ Parameters from command line ------' + '\n')
            print('    INFILE: ' + self.IFN)
            print('    DATABASE: ' + self.DATABASE) 
            print('    TABLEFILE: ' + self.TFN)   
            print('    SEPERATOR: ' + self.SEPERATOR)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def GetIDlist(IFN,SEPERATOR):
    """ 
    return a non-redundant list of IDs from the input file.
    """
    h_INFILE  = open(IFN, "r")
    UIDlist = []
    IDType='uid'
    SPATTERN=re.compile('[A-Z]',re.IGNORECASE)
    for line in h_INFILE :
        if line[0] != '#' :
            rawline = line.strip()
            TOKENS = rawline.split(SEPERATOR)
            if not TOKENS[0] in UIDlist : # eliminate duplicate ID numbers
               if not TOKENS[0] == "" :
                   UIDstr = UIDlist.append(TOKENS[0])
            if re.search(SPATTERN,TOKENS[0]) :
               IDType='accn'
    h_INFILE.close()
    return UIDlist

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def Acc2Taxid(ID,DATABASE) :
    """ 
    Given an ACCESSION number or LOCUS name, return the corresponding taxid.
    """

    # We can't be certain whether the ID is an Accession number of a Locus name.
    # Links to the Taxonomy databse require an ACCESSION number or a UID, so we need
    # to run esearch to get a UID that will be correct, regardless of whether
    # ID is a LOCUS or ACCESSION number.
    EQUERY = ID + " [ACCN]"
    handle = Entrez.esearch(db=DATABASE, term=EQUERY)
    result = Entrez.read(handle) 
    
    UID = result["IdList"][0] #UID is NCBI's numerical ID of the object returned

    # Run elink to retrieve the entries
    taxid=""
    linkref = Entrez.elink(id=UID, dbfrom=DATABASE, db="taxonomy", retmode="xml")
    taxlink = Entrez.read(linkref)
    taxid=taxlink[0]["LinkSetDb"][0]["Link"][0]["Id"]
    print("ID: " + ID + "    " + "TAXID: " + taxid)

    return taxid

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def GetTaxdata(taxid) :
    """ 
    Given a taxonomy id, return the taxonomy data as an XML object
    """
    thandle = Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
    taxdata = Entrez.read(thandle)

    return taxdata

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def XMLtoPhylo(ID,taxdata,tfile):
    """ 
    Write out XML objects to table for use with the forester decorate program.
 """

    NL="\n"
    TAB="\t"
    # Taxonomy info
    tfile.write(ID)
    tfile.write(TAB + "TAXONOMY_CODE:" + taxdata[0]["TaxId"])
    tfile.write(TAB + "TAXONOMY_ID:" + taxdata[0]["TaxId"])
    tfile.write(TAB + "TAXONOMY_SN:" + taxdata[0]["ScientificName"])
    #tfile.write(TAB + "TAXONOMY_CN:" + taxdata[0]["CommonName"][0])

    # Sequence info
    #tfile.write(TAB + ID)
    tfile.write(TAB + "SEQ_ACCESSION:" + ID)
    # Warning! The last item in the list must not have a terminal TAB
    tfile.write(NL)

#======================== MAIN PROCEDURE ==========================
def main():
    """
        Called when not in documentation mode.

        Unfortunately, it is necessary to post an Entrez request for
        each UID given. The reason is that batch requests retrun taxonomy
        XML objects that don't tell you the corresponding sequence ID
        In that case, there is no way to tell which UID corresponds to a
        particular TaxID. This makes the process slow.

        """
    # Read parameters from command line
    P = Parameters()

    IDlist = GetIDlist(P.IFN,P.SEPERATOR)

    if P.TFN != "" :
        tfile = open(P.TFN,"w")
    else :
        tfile = None

    if tfile != None :
        for ID in IDlist :

            # Retrieve entries from NCBI
            taxid = Acc2Taxid(ID,P.DATABASE)
            taxdata = GetTaxdata(taxid)

            # Write taxonomy data to a table file to be used with 
            # decorate -table   from the forester package.
            if tfile != None :
                XMLtoPhylo(ID,taxdata,tfile)
    else :
        print("taxfetch.py: >>> --tablefile must be specified on the command line.<<<")

    if tfile != None :
        tfile.close()   
   
    BM.exit_success()

if (BM.documentor() or "-test" in sys.argv):
    pass
else:
    main()