#!/usr/bin/env python3 import os #import os.path import subprocess import sys import re import shutil from Bio import Entrez #optparse is deprecated in favor of argparse as of Python 2.7. However, # since 2.7 is not always present on many systems, at this writing, # it is safer to stick with optparse for now. It should be easy # to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser ''' taxfetch.py - Given a set of GenBank LOCUS or ACCESSION numbers , create a file containing the corresponding NCBI taxonomy entries. If the input file is a csv/tsv file, only the first column is read. Lines beginning with '#' are ignored as comments Synopsis: taxfetch.py --infile infile --db database --tablefile tablefilename [--sep separator] --infile containing LOCUS or ACCESSION numbers --db - NCBI database as defined for EUtilities --tablefile for use by forrester decorator program --sep - separator character to use when input is a csv file eg tab, comma @modified: July 6, 2020 @author: Brian Fristensky @contact: frist@cc.umanitoba.ca ''' blib = os.environ.get("BIRCHPYLIB") sys.path.append(blib) from birchlib import Birchmod from birchlib import Argument PROGRAM = "taxfetch.py : " USAGE = "\n\tUSAGE: taxfetch.py --infile infile --db database [--tablefile tablefile] [--sep separator]" DEBUG = True if DEBUG : print('Debugging mode on') BM = Birchmod(PROGRAM, USAGE) # To use an Entrez API key and email address. These variables can be set once for the Entrez class, # and will be used for all EUtil calls. Entrez.api_key = os.getenv("NCBI_ENTREZ_KEY", default=None) Entrez.email = os.getenv("BL_EMAIL", default=None) if DEBUG : print("Entrez.api_key: " + str(Entrez.api_key)) print('Entrez.email: ' + str(Entrez.email)) class Parameters: """ Wrapper class for command line parameters """ def __init__(self): """ Initializes arguments: IFN="" DATABASE = "nuccore" SEPARATOR = "\t" TFN="" PID="" Then calls read_args() to fill in their values from command line """ self.IFN = "" self.DATABASE = "taxonomy" self.TFN = "" self.SEPARATOR = "," self.PID = str(os.getpid()) self.read_args() def read_args(self): """ Read command line arguments into a Parameter object """ parser = OptionParser() parser.add_option("--infile", dest="ifn", action="store", type="string", default="", help="file of LOCUS or ACCESSION numbers") parser.add_option("--db", dest="database", action="store", type="string", default="nuccore", help="database to search") parser.add_option("--tablefile", dest="tfn", action="store", type="string", default="", help="output in table format for forrester decorator program") parser.add_option("--sep", dest="seperator", action="store", type="string", default="\t", help="filter for query") (options, args) = parser.parse_args() self.IFN = options.ifn self.DATABASE = options.database self.TFN = options.tfn self.SEPERATOR = options.seperator if DEBUG : print('------------ Parameters from command line ------' + '\n') print(' INFILE: ' + self.IFN) print(' DATABASE: ' + self.DATABASE) print(' TABLEFILE: ' + self.TFN) print(' SEPERATOR: ' + self.SEPERATOR) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def GetIDlist(IFN,SEPERATOR): """ return a non-redundant list of IDs from the input file. """ h_INFILE = open(IFN, "r") UIDlist = [] IDType='uid' SPATTERN=re.compile('[A-Z]',re.IGNORECASE) for line in h_INFILE : if line[0] != '#' : rawline = line.strip() TOKENS = rawline.split(SEPERATOR) if not TOKENS[0] in UIDlist : # eliminate duplicate ID numbers if not TOKENS[0] == "" : UIDstr = UIDlist.append(TOKENS[0]) if re.search(SPATTERN,TOKENS[0]) : IDType='accn' h_INFILE.close() return UIDlist # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def Acc2Taxid(ID,DATABASE) : """ Given an ACCESSION number or LOCUS name, return the corresponding taxid. """ # We can't be certain whether the ID is an Accession number of a Locus name. # Links to the Taxonomy databse require an ACCESSION number or a UID, so we need # to run esearch to get a UID that will be correct, regardless of whether # ID is a LOCUS or ACCESSION number. EQUERY = ID + " [ACCN]" handle = Entrez.esearch(db=DATABASE, term=EQUERY) result = Entrez.read(handle) UID = result["IdList"][0] #UID is NCBI's numerical ID of the object returned # Run elink to retrieve the entries taxid="" linkref = Entrez.elink(id=UID, dbfrom=DATABASE, db="taxonomy", retmode="xml") taxlink = Entrez.read(linkref) taxid=taxlink[0]["LinkSetDb"][0]["Link"][0]["Id"] print("ID: " + ID + " " + "TAXID: " + taxid) return taxid # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def GetTaxdata(taxid) : """ Given a taxonomy id, return the taxonomy data as an XML object """ thandle = Entrez.efetch(db="taxonomy", id=taxid, retmode="xml") taxdata = Entrez.read(thandle) return taxdata # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def XMLtoPhylo(ID,taxdata,tfile): """ Write out XML objects to table for use with the forester decorate program. """ NL="\n" TAB="\t" # Taxonomy info tfile.write(ID) tfile.write(TAB + "TAXONOMY_CODE:" + taxdata[0]["TaxId"]) tfile.write(TAB + "TAXONOMY_ID:" + taxdata[0]["TaxId"]) tfile.write(TAB + "TAXONOMY_SN:" + taxdata[0]["ScientificName"]) #tfile.write(TAB + "TAXONOMY_CN:" + taxdata[0]["CommonName"][0]) # Sequence info #tfile.write(TAB + ID) tfile.write(TAB + "SEQ_ACCESSION:" + ID) # Warning! The last item in the list must not have a terminal TAB tfile.write(NL) #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. Unfortunately, it is necessary to post an Entrez request for each UID given. The reason is that batch requests retrun taxonomy XML objects that don't tell you the corresponding sequence ID In that case, there is no way to tell which UID corresponds to a particular TaxID. This makes the process slow. """ # Read parameters from command line P = Parameters() IDlist = GetIDlist(P.IFN,P.SEPERATOR) if P.TFN != "" : tfile = open(P.TFN,"w") else : tfile = None if tfile != None : for ID in IDlist : # Retrieve entries from NCBI taxid = Acc2Taxid(ID,P.DATABASE) taxdata = GetTaxdata(taxid) # Write taxonomy data to a table file to be used with # decorate -table from the forester package. if tfile != None : XMLtoPhylo(ID,taxdata,tfile) else : print("taxfetch.py: >>> --tablefile must be specified on the command line.<<<") if tfile != None : tfile.close() BM.exit_success() if (BM.documentor() or "-test" in sys.argv): pass else: main()