#!/usr/bin/env python import birchenv import birchscript import os import os.path import subprocess import sys import re import shutil from Bio import Entrez #optparse is deprecated in favor of argparse as of Python 2.7. However, # since 2.7 is not always present on many systems, at this writing, # it is safer to stick with optparse for now. It should be easy # to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser ''' seqfetch.py - Given a set of GI (UID) or ACCESSION numbers , create a file containing the corresponding genbank entries. If the input file is a csv file, only the first column is read. Lines beginning with '#' are ignored as comments Synopsis: seqfetch.py --infile infile --outfile outfile [--format sequence_format] [--query entrez_query_statement] [--db database] [--sep separator] --infile containing GI (UID) or ACCESSION numbers --outfile to contain all of the GenBank entries --query entrez_query_statement - limit sequences retrieved using an Entrez query --format sequence_format - sequence format as defined by NCBI EUtilities. --db - NCBI database as defined for EUtilities --sep - separator character to use when input is a csv file eg tab, comma @modified: May 8, 2020 @author: Brian Fristensky @contact: frist@cc.umanitoba.ca ''' blib = os.environ.get("BIRCHPYLIB") sys.path.append(blib) from birchlib import Birchmod from birchlib import Argument PROGRAM = "seqfetch.py : " USAGE = "\n\tUSAGE: seqfetch.py --infile infile --outfile outfile [--format sequence_format] [--query entrez_query_statement] [--db database] [--sep separator]" DEBUG = True if DEBUG : print('Debugging mode on') BM = Birchmod(PROGRAM, USAGE) EMAILADDR = BM.getEmailAddr() if (EMAILADDR != "") : Entrez.email = EMAILADDR if DEBUG : print('EMAILADDR: ' + EMAILADDR) class Parameters: """ Wrapper class for command line parameters """ def __init__(self): """ Initializes arguments: IFN="" EQUERY="" FORMAT="gb" DATABASE = "nuccore" SEPARATOR = "," OFN="" PID="" Then calls read_args() to fill in their values from command line """ self.IFN = "" self.EQUERY = "" self.FORMAT = "gb" self.DATABASE = "nuccore" self.SEPARATOR = "," self.OFN = "" self.PID = str(os.getpid()) self.read_args() def read_args(self): """ Read command line arguments into a Parameter object """ parser = OptionParser() parser.add_option("--infile", dest="ifn", action="store", type="string", default="", help="file of GI or Accession numbers") parser.add_option("--outfile", dest="ofn", action="store", type="string", default="", help="output file") parser.add_option("--format", dest="format", action="store", type="string", default="gb", help="format for output file") parser.add_option("--query", dest="query", action="store", type="string", default="", help="ncbiquery query term") parser.add_option("--db", dest="database", action="store", type="string", default="nuccore", help="database to search") parser.add_option("--sep", dest="seperator", action="store", type="string", default=",", help="filter for query") (options, args) = parser.parse_args() self.IFN = options.ifn self.OFN = options.ofn self.FORMAT = options.format self.EQUERY = options.query self.DATABASE = options.database self.SEPERATOR = options.seperator if DEBUG : print('------------ Parameters from command line ------' + '\n') print(' INFILE: ' + self.IFN) print(' OUTFILE: ' + self.OFN) print(' FORMAT: ' + self.FORMAT) print(' EQUERY: ' + self.EQUERY) print(' DATABASE: ' + self.DATABASE) print(' SEPERATOR: ' + self.SEPERATOR) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def Retrieve(IFN,OFN,JOBID,FORMAT,EQUERY,DATABASE,SEPARATOR): """ """ #If input file contains characters other than [0-9], we assume #that the input is ACCESSION numbers. By default, we assume #the input is GI numbers. # Preprocessing - strip out comment lines beginning with '#' and # write to a new file only column 1, assuming that the delimiter is a tab. h_INFILE = open(IFN, "r") # UIDstr is a string containing a comma-separated list of UIDs or Accessions. # We need to make this more sophisticated to break up lists greater than # 500 UIDs into smaller lists so that we can do mulitple queries, rather than # one really big query. UIDstr="" IDType='uid' SPATTERN=re.compile('[A-Z]',re.IGNORECASE) for line in h_INFILE : if line[0] != '#' : rawline = line.strip() #print(rawline) TOKENS = rawline.split(SEPARATOR) if (UIDstr == "") : UIDstr=TOKENS[0] elif not TOKENS[0] in UIDstr : # eliminate duplicate ID numbers UIDstr = UIDstr + ',' + TOKENS[0] if re.search(SPATTERN,TOKENS[0]) : IDType='accn' h_INFILE.close() if DEBUG : print('UIDstr: ' + UIDstr) print('IDType: ' + IDType) # If the ID list is GI/uid numbers, a query statement will be evaluated to apply to the # ID list. This option is NOT available if the ID list is ACCESSION numbers. # Run epost to generate a history file at NCBI, containing links to the # specified entries. if IDType=='uid' : handle = Entrez.epost(db=DATABASE, id=UIDstr, usehistory="y") result = Entrez.read(handle) WEBENV = result["WebEnv"] QUERYKEY = result["QueryKey"] if DEBUG : print("WebEnv: " + WEBENV) print("query_key: " + QUERYKEY) if (EQUERY != "" ): # Run efilter apply an Entrez query term to the list handle = Entrez.esearch(db=DATABASE, term=EQUERY, usehistory="y", WebEnv=WEBENV, query_key=QUERYKEY) result = Entrez.read(handle) WEBENV = result["WebEnv"] QUERYKEY = result["QueryKey"] if DEBUG : print("WebEnv: " + WEBENV) print("query_key: " + QUERYKEY) # Run efetch to retrieve the entries h_OUTFILE = open(OFN, "w") # handle = Entrez.efetch(id=UIDstr, db=DATABASE,query_key=QUERYKEY, WebEnv=WEBENV, usehistory="y", retmode="text", rettype=FORMAT) handle = Entrez.efetch(db=DATABASE,query_key=QUERYKEY, WebEnv=WEBENV, usehistory="y", retmode="text", rettype=FORMAT) else: h_OUTFILE = open(OFN, "w") handle = Entrez.efetch(id=UIDstr, db=DATABASE, retmode="text", rettype=FORMAT) for line in handle: h_OUTFILE.write(line) #print line handle.close() h_OUTFILE.close() #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ # Read parameters from command line P = Parameters() # Retrieve entries from NCBI to outfile Retrieve(P.IFN,P.OFN,P.PID,P.FORMAT,P.EQUERY,P.DATABASE,P.SEPARATOR) BM.exit_success() if (BM.documentor() or "-test" in sys.argv): pass else: main()