#!/usr/bin/env python import birchscript import re import os import os.path import shutil import subprocess import sys # FETCH - a c-shell script (version 24 Feb 07) # FETCH retrieves database entries keyed on either accession numbers # or GenBank LOCUS names # 24 Feb 2007 Beginning with GenBank Release 158.0, the accession # number index is broken into several files, gbacc1.idx, # gbacc2.idx etc. FETCH has been updated to find all of # these files and search each of them. # 25 Jan 2002 Previously, getloc would use the .gen file extension for # PIR files. getloc now creates PIR files with the .pir extension. # FETCH has been updated to also use the .pir extension. # Also, support for VECBASE has been removed. # 25 Aug 2000 revised to accommodate the new tab-delimited format # for gbacc.idx, starting with Release 119.0 # Also, no longer supports the discontinued RNA division # of GenBank. # DOES NOT YET SEARCH THE NEW SECONDARY ACC. # FILE gbsec.idx ##################################################################### # set default parameters #################################################################### # It is assumed that the following environment variables have been set # by .cshrc or .login: # GB - GenBank directory # PIR - PIR/NBRF directory # Running fetch on a remote host # If your databases are on a different host, you can still run fetch # locally, and it will use rsh to run a fetch job on the remote host. # To facilitate remote fetching, you must set the environment variable # XYLEM_RHOST to the name of the remote host, and XYLEM_USERID to your # userid on that remote host (should be done in .cshrc) name = "" namefile = "" outfile = "" whattoget = "b" database = "g" dbname = "" files = "f" interactive = True getloc_options = "" #string to hold command line options for getloc jobid = os.getpid() forcelocal = False # forcelocal - prevents remote execution as # specified by XYLEM_RHOST. NICE = 8 namefile_lines = [] tempdir = "FETCHDIR." + jobid STARTDIR = os.getcwd() # Platform-specific syntax is chosen based on XYLEM_PLATFORM. # (default = sun) if os.environ.has_key("XYLEM_PLATFORM"): XYLEM_PLATFORM = os.environ.get("XYLEM_PLATFORM") else: XYLEM_PLATFORM = "sun" #Sun4, Sparcstations if XYLEM_PLATFORM == "sun": RM = "/usr/bin/rm -f" else: RM = "rm -f" ##################################################################### # Read options from the command line and set parameters, # or prompt user for parameters at terminal. #################################################################### if len(sys.argv) != 0: #--------------------------------------------------- #parameters given in command line files = "" interactive = False index = 1 while index <= len(sys.argv): arg_test = sys.argv[index] if arg_test in ("-a", "-s", "-b", "-g", "-e", "-p"): database = arg_test[1] getloc_options += " " + arg_test elif arg_test == "-f": files = "f" getloc_options += " " + arg_test outfile = "" elif arg_test == "-G": database = "G" index += 1 dbname = arg_test elif arg_test == "-P": database = "P" index += 1 dbname = arg_test getloc_options += " -p" else: if namefile == "": namefile = arg_test else: outfile = arg_test files = "" index += 1 if namefile == "": print 'No namefile specified' exit() elif files == "" and outfile == "": print 'No outfile specified' exit() #strip comments and blank lines out of $namefile # possibly use namefile_lines instead of writing to file #egrep -v -e \; $namefile |egrep -v -e '^ *$' > NAMEFILE.$jobid namefile_lines = [] h_namefile = open(namefile, "r") for line in h_namefile: if line.rstrip() != "" and not re.search(";", line): namefile_lines += line h_namefile.close() else: #--------------------------------------------------------------- # Interactive parameter input complete = False while not complete: #Display current parameter settings print '___________________________________________________________________' print ' FETCH - Version 24 Feb 07 ' print ' Please cite: Fristensky (1993) Nucl. Acids Res. 21:5997-6003' print '___________________________________________________________________' print 'Namefile: ' + namefile print 'Outfile: ' + outfile print 'Database: ' + dbname print '-------------------------------------------------------------------' print ' Parameter Description Value' print '' print '1) Name/Acc Name or Accession sequence to get ' + name print '2) Namefile Get list of sequences from Namefile' print '3) WhatToGet a:annotation s:sequence b:both ' + whattoget print '4) Database g:GenBank p:PIR ' + database print ' G:GenBank dataset P:PIR dataset' print '5) Outfile Send all output to a single file (Outfile)' print '6) Files f:Send each entry to a separate file ' + files print ' -------------------------------------------------------------' #Prompt for parameter to change paramnum = int(raw_input(' Type number of your choice or 0 to continue:')) if paramnum == 0: if not os.path.exists("NAMEFILE." + jobid): print ">>> Must specify Name/Acc or Namefile" raw_input(">>> Press RETURN to continue") else: complete = True elif paramnum == 1: name = raw_input('Type Name or Accession number of sequence to get:') h_temp = open("NAMEFILE." + jobid, "w") print >> h_temp, name h_temp.close() namefile = "" elif paramnum == 2: namefile = raw_input('Name of file containing sequence names or accession numbers:') if namefile != "": if os.path.exists(namefile) and os.path.is_file(namefile): #strip comments and blank lines out of $namefile # possibly use namefile_lines instead of writing to file # egrep -v -e \; $namefile |egrep -v -e '^ *$'> NAMEFILE.$jobid namefile_lines = [] h_namefile = open(namefils, "r") for line in h_namefile: if line.rstrip() != "" and not re.search(";", line): namefile_lines += line h_namefile.close() name = "" else: print ">>> " + namefile + " nonexistent or read-protected" print ">>>Enter another filename" namefile = "" elif paramnum == 3: temp = raw_input('Type a for annotation, s for sequence or b for both:') if temp in ("a", "s", "b"): whattoget = temp else: print '>>> Invalid choice' elif paramnum == 4: temp = raw_input('Choose one of gGpP') if temp in ("g", "G", "p", "P"): database = temp if temp in ("G", "P"): dbname = raw_input('Enter filename:') else: print '>>> Invalid choice' elif paramnum == 5: outfile = raw_input('Type name of file for output') files = "" elif paramnum == 6: files = "f" outfile = "" #If parameter chosen is 0, and a minimal set of parameters are #set, terminate the loop (complete=True) #set options if whattoget == "b": if database in ("g", "G"): extension = ".gen" elif database in ("p", "P"): extension = ".pir" getloc_options += " -p" else: getloc_options = "-" + whattoget if whattoget == "a": extension = ".ano" else: extension = ".wrp" if database != "g": getloc_options += " -" + database if files == "f" and interactive: getloc_options += " -f" # start the BIIIIIG LONG if statements if os.environ.has_key("XYLEM_RHOST") and database not in ("G", "P") and forcelocal: ##################################################################### # Run FETCH remotely, if XYLEM_RHOST and XYLEM_USERID are set #################################################################### XYLEM_RHOST = os.environ.get("XYLEM_RHOST") XYLEM_USERID = os.environ.get("XYLEM_USERID") if XYLEM_RHOST == "clever": os.mkdir(tempdir) os.chdir(tempdir) # Retrieve the specified entries using nclever p_nclever = subprocess.Popen(["nclever", "-b"], stdin=subprocess.PIPE) print >> p_nclever.stdin, "database nucleotide" print >> p_nclever.stdin, "report GenBank" for name in namefile_lines: print >> p_nclever.stdin, "accession " + name print >> p_nclever.stdin, "union all" print >> p_nclever.stdin, "save OUTFILE." + jobid + " all" print >> p_nclever.stdin, "quit" p_nclever.stdin.close() p_nclever.wait() # Create a XYLEM dataset if database == "g": if outfile != "": shutil.move("OUTFILE." + jobid, os.path.join(STARTDIR, outfile)) else: os.nice(NICE) subprocess.call(["splitdb", "OUTFILE." + jobid, "OUTFILE.ano", "OUTFILE.wrp", "OUTFILE.ind"]) os.nice(0) subprocess.call(["getloc", "-f", "OUTFILE.ind", "OUTFILE.ano", "OUTFILE.wrp", "OUTFILE.ind"]) os.chdir(STARTDIR) else: # Remote hosts can be chosen by having a script called choosehost # in your bin directory. choosehost returns the name of a remote # host. While one possible implementation of choosehost is provided # with XYLEM, choosehost can be tailored to your particular # configuration of servers. if XYLEM_RHOST == "choosehost": XYLEM_RHOST = birchscript.choosehost() tempname = "TMP_" + jobid remotefn = XYLEM_USERID + "@" + XYLEM_RHOST + ":" + tempname if interactive: print "Copying namefile to $remotefn.nam ..." h_namefile = open("NAMEFILE." + jobid, "w") h_namefile.writelines(namefile_lines) h_namefile.close() subprocess.call(["rcp", "NAMEFILE." + jobid, remotefn + ".nam"]) if interactive: print "Running FETCH remotely on " + XYLEM_RHOST + " as user " + XYLEM_USERID + " ..." subprocess.call(["rsh", XYLEM_RHOST, "-l", XYLEM_USERID, "fetch", "-L", getloc_options, tempname + ".nam", tempname + ".out"]) if outfile == "": outfile = tempname + ".out" if interactive: print "Copying " + remotefn + ".out to " + outfile + " ..." subprocess.call(["rcp", remotefn + ".out", outfile]) # This allows a single name/acc to be specified in 1) without specifying # an outfile name. -f will not work if you use 2) Namefile. if files == "f": if name != "": shutil.move(outfile, name + extension) if interactive: print "Removing temporary files..." subprocess.call(["rsh", XYLEM_RHOST, "-l", XYLEM_USERID, RM, tempname + ".nam", tempname + ".out"]) for file in os.listdir(os.getcwd()): if file.endswith("." + jobid): os.remove(os.path.join(os.getcwd(), file)) else: ##################################################################### # Extract entries from local database #################################################################### #--------------------------- GenBank --------------------- if database in ("g", "G"): if database == "G": # If dataset is not split, split it base = os.path.splitext(dbname)[0] dbextension = os.path.splitext(dbname)[1] if dbextension == ".gen": #GenBank needtosplit = True base = "TMP" + jobid os.nice(NICE) subprocess.call(["splitdb", dbname, base + ".ano", base + ".wrp", base + ".ind"]) os.nice(0) else: needtosplit = False results = 0 #find the accession numbers os.nice(10) h_resultfile = open("FOUND." + jobid, "a") for file in os.listdir(os.path.join(os.getcwd(), GB)): accfile = os.path.basename(file) if accfile.startswith("gbacc") and os.path.splitext(accfile)[1] == ".idx": # To search for only one name, use egrep, otherwise use fgrep if interactive: print "Searching index file " + accfile + " ..." h_accfile = open(accfile, "r") for acc_line in h_accfile: for regex in namefile_lines: # re.I means ignore case # egrep -i -e `cat NAMEFILE.$jobid` $accfile >> PRELIMINARY.$jobid # . . . . . . . . . . . . . . . . . # Do a more careful screening of the preliminary hits, making sure # that each hit appeared as a separate token, and not as part of # a longer token. #for token in namefile_lines: # grep -w -i $token PRELIMINARY.$jobid >> FOUND.$jobid #end #------ # combining the above two screens (NOTE: preliminary.jobid does not appear anywhere # else within this "case" statement #------ if re.search("\b" + regex + "\b", acc_line, re.I): h_resultfile.write(acc_line) results += 1 h_accfile.close() h_resultfile.close() os.nice(0) namefile_lines = None # . . . . . . . . . . . . . . . . . # Retrieve sequences, if any were found. if results == 0: if interactive: print "No matches found in " + GB + os.sep + "gbacc.idx" else: # If final output goes to a single file, # make a temporary directory to store retrieved entries if files != "f": base = os.path.join(os.getcwd(), base) os.mkdir(tempdir) shutil.move("FOUND." + jobid, os.path.join(tempdir, "FOUND." + jobid)) os.chdir(tempdir) if database == "g": #retrieve the entries from each division using getloc divisions = ["bct", "inv", "mam", "phg", "pln", "pri", "rod", "syn", "una", "vrt", "vrl", "est", "pat", "sts", "gss", "htg"] for div in divisions: # egrep -i -e " $div " FOUND.$jobid > TMP.$jobid # cut -f2 -d" " TMP.$jobid |sort| uniq > name.$jobid name_jobid = [] h_found = open("FOUND." + jobid) for line in h_found: if re.search("\t" + div + "\t", line, re.IGNORECASE): tmplist += div if len(name_jobid) > 0: if interactive: print "Retrieving entries from " + os.path.join(GB, "gb" + div) + " ..." #cut out the second tab-separated field, containing # the names of hits, sort it, and write to name.$jobid # tr -s " " " " < TMP.$jobid |cut -f2 -d" " |sort| uniq > name.$jobid #cut -f2 -d" " TMP.$jobid |sort| uniq > name.$jobid name_jobid = list(set(name_jobid)).sort() # >> name.jobid # Most GenBank divisions are present in one file eg. gbinv. # Large GenBank divisions such as EST and Primate are split # eg. gbest1, gbest2, gbest3... # Regardless of how many divisions there are, DIVSET creates # the list of all files for that division. if os.path.exists(os.path.join(GB, "gb" + div + ".ind")): DIVSET = div else: index = 1 DIVSET = [] while os.path.exists(os.path.join(GB, "gb" + div + index + ".ind")): DIVSET.append(div + index) index += 1 # Do a more careful screening of the preliminary hits, making sure # that each hit appeared as a separate token, and not as part of # a longer token. os.nice(10) h_base_jobid = open(base + "." + jobid, "a") for token in name_jobid: for base in DIVSET: #grep -w -i $token $GB/gb$base.ind >> $base.$jobid h_gbbase = open(os.path.join(GB, "gb" + base + ".ind"), "r") for line in h_gbbase: if re.search("\b" + token + "\b", line, re.IGNORECASE): h_gbbase.write(line) h_gbbase.close() h_base_jobid.close() for base in DIVSET: if whattoget == "b": extension = ".gen" subprocess(["getloc", getloc_options, "-f", base + "." + jobid, os.path.join(GB, "gb" + base + ".ano"), os.path.join(GB, "gb" + base + ".wrp"), os.path.join(GB, "gb" + base + ".ind")]) elif whattoget == "a": extension = ".ano" subprocess(["getloc", getloc_options, "-f", base + "." + jobid, os.path.join(GB, "gb" + base + ".ano"), os.path.join(GB, "gb" + base + ".ind")]) elif whattoget == "s": extension = ".wrp" subprocess(["getloc", getloc_options, "-f", base + "." + jobid, os.path.join(GB, "gb" + base + ".wrp"), os.path.join(GB, "gb" + base + ".ind")]) os.nice(0) # . . . . . . . . . . . . . . . . . #If final output goes to a single file, #write the entries to a file in the next highest directory, and #get rid of the temporary directory if files == "f": h_outfile = open(os.path.join(STARTDIR, outfile), "w") for file in os.listdir(os.getcwd()): if os.path.splitext(file)[1] == extension: birchscript.cat_to(file, h_outfile) h_outfile.close() os.chdir(STARTDIR) shutil.rmtree(tempdir) #--------------------------- User-defined GenBank --------------------- elif database == "G": #retrieve the entries using getloc if os.path.exists("FOUND." + jobid): if interactive: print "Retrieving entries from " + dbname + " ..." os.nice(10) if whattoget == "b": extension = ".gen" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, base + ".ano", base + ".wrp", base + ".ind"]) elif whattoget == "a": extension = ".ano" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, base + ".ano", base + ".ind"]) elif whattoget == "s": extension = ".wrp" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, base + ".wrp", base + ".ind"]) os.nice(0) #If final output goes to a single file, #write the entries to a file in the next highest directory, and #get rid of the temporary directory if files != "f": h_outfile = open(os.path.join(STARTDIR, outfile), "w") for file in os.listdir(os.getcwd()): if os.path.splitext(file)[1] == extension: birchscript.cat_to(file, h_outfile) h_outfile.close() os.chdir(STARTDIR) shutil.rmtree(tempdir) if needtosplit: os.remove(base + ".ano") os.remove(base + ".wrp") os.remove(base + ".ind") #---------------------------- PIR/NBRF -------------------- elif database in ("p", "P"): if database == "P": # If dataset is not split, split it base = os.path.splitext(dbname)[0] dbextension = os.path.splitext(dbname)[1] if dbextension == ".pir": #PIR needtosplit = True base = "TMP" + jobid os.nice(NICE) subprocess.call(["splitdb", "-p", dbname, base + ".ano", base + ".wrp", base + ".ind"]) os.nice(0) else: needtosplit = False # If final output goes to a single file, # make a temporary directory to store retrieved entries if files != "f": base = os.path.join(os.getcwd(), base) os.mkdir(tempdir) shutil.move("FOUND." + jobid, os.path.join(tempdir, "FOUND." + jobid)) os.chdir(tempdir) if database == "p": #retrieve entries divisions = ["pir1", "pir2", "pir3", "pir4"] success = False for div in divisions: #find the accession numbers if interactive: print "Searching index file " + os.path.join(PIR, div + ".ind") + " ..." # Do a more careful screening of the preliminary hits, making sure # that each hit appeared as a separate token, and not as part of # a longer token. This time, it's not as simple as for GenBank. # The for loop writes to TRUE.$jobid every line in PRELIMINARY. # fetch that contains one of the tokens in NAMEFILE.$jobid. # However, since grep can only search for one token at a time, # iterative use of grep makes the lines in TRUE.$jobid appear out # of order, w.r.t. to the .ind file. For efficiency, we want # FOUND.$jobid to list the lines in the order they appear in the # .ind file. Therefore, we run fgrep through PRELIMINARY.$jobid, # searching for all lines appearing in TRUE.$jobid. os.nice(10) found_lines = [] h_divind = open(os.path.join(PIR, div + ".ind", "r")) # fgrep -i -f NAMEFILE.$jobid $PIR/$div.ind > PRELIMINARY.$jobid for line in h_divind: for name in namefile_lines: if re.search(name, line, re.IGNORECASE) and re.search("\b" + token + "\b", line): found_lines += line h_divind.close() os.nice(0) # Retrieve sequences, if any were found. #retrieve the entries using getloc if len(found_lines) == 0: if interactive: print "No matches found in " + os.path.join(PIR, div + ".ind") else: h_found = open("FOUND." + jobid, "w") h_found.writelines(found_lines) h_found.close() success = True if interactive: print "Retrieving entries from " + os.path.join(PIR, div) + " ..." os.nice(10) #note: NBRF does _NOT_ have the PIR entries sorted! Entries are listed #in the index in the order in which they appear in the database. if whattoget == "b": extension = ".pir" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, \ os.path.join(PIR, div + ".ano"), os.path.join(PIR, div + ".wrp"), \ os.path.join(PIR, div + ".ind")]) elif whattoget == "a": extension = ".ano" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, \ os.path.join(PIR, div + ".ano"), os.path.join(PIR, div + ".ind")]) elif whattoget == "s": extension = ".wrp" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, \ os.path.join(PIR, div + ".wrp"), os.path.join(PIR, div + ".ind")]) os.nice(0) #If final output goes to a single file, #write the entries to a file in the next highest directory, and #get rid of the temporary directory if files != "f": if success: h_outfile = open(os.path.join(STARTDIR, outfile)) for file in os.listdir(os.getcwd()): if os.path.splitext(file)[1] == extension: birchscript.cat_to(file, h_outfile) h_outfile.close() os.chdir(STARTDIR) shutil.rmtree(tempdir) exit() #----------------------------User-defined PIR/NBRF -------------------- elif database == "P": # If dataset is not split, split it base = os.path.splitext(dbname)[0] dbextension = os.path.splitext(dbname)[1] if dbextension == ".pir": #PIR needtosplit = True base = "TMP" + jobid os.nice(NICE) subprocess.call(["splitdb", "-p", dbname, base + ".ano", base + ".wrp", base + ".ind"]) os.nice(0) else: needtosplit = False # If final output goes to a single file, # make a temporary directory to store retrieved entries if files != "f": base = os.path.join(os.getcwd(), base) os.mkdir(tempdir) shutil.move("NAMEFILE." + jobid, os.path.join(tempdir, "NAMEFILE." + jobid)) os.chdir(tempdir) #retrieve entries success = False #find the accession numbers if interactive: print "Searching index file " + base + ".ind ..." # Do a more careful screening of the preliminary hits, making sure # that each hit appeared as a separate token, and not as part of # a longer token. This time, it's not as simple as for GenBank. # The for loop writes to TRUE.$jobid every line in PRELIMINARY. # fetch that contains one of the tokens in NAMEFILE.$jobid. # However, since grep can only search for one token at a time, # iterative use of grep makes the lines in TRUE.$jobid appear out # of order, w.r.t. to the .ind file. For efficiency, we want # FOUND.$jobid to list the lines in the order they appear in the # .ind file. Therefore, we run fgrep through PRELIMINARY.$jobid, # searching for all lines appearing in TRUE.$jobid. os.nice(10) h_baseind = open(base + ".ind", "r") found_lines = [] # fgrep -i -f NAMEFILE.$jobid > PRELIMINARY.$jobid # grep -w -i $token PRELIMINARY.$jobid >> TRUE.$jobid for line in h_baseind: for name in namefile_lines: if re.search(name, line, re.IGNORECASE) and re.search("\b" + name + "\b", line, re.IGNORECASE): found_lines += line h_baseind.close() os.nice(0) # Retrieve sequences, if any were found. #retrieve the entries using getloc if len(found_lines) == 0: if interactive: print "No matches found in " + base + ".ind" else: success = True if interactive: print "Retrieving entries from " + dbname + " ..." #note: NBRF does _NOT_ have the PIR entries sorted! Entries are listed #in the index in the order in which they appear in the database. os.nice(10) if whattoget == "b": extension = ".pir" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, base + ".ano", base + ".wrp", base + ".ind"]) elif whattoget == "a": extension = ".ano" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, base + ".ano", base + ".ind"]) elif whattoget == "s": extension = ".wrp" subprocess.call(["getloc", getloc_options, "-f", "FOUND." + jobid, base + ".wrp", base + ".ind"]) os.nice(0) #If final output goes to a single file, #write the entries to a file in the next highest directory, and #get rid of the temporary directory if files != "f": if success: h_outfile = open(os.path.join(STARTDIR, outfile)) for file in os.listdir(os.getcwd()): if os.path.splitext(file)[1] == extension: birchscript.cat_to(file, h_outfile) h_outfile.close() os.chdir(STARTDIR) shutil.rmtree(tempdir) exit() if needtosplit: os.remove(base + ".ano") os.remove(base + ".wrp") os.remove(base + ".ind") ##################################################################### # Clean up. #################################################################### for file in os.listdir(os.getcwd()): if os.path.splitext(file)[1] == ("." + jobid) and os.path.isfile(file): os.remove(os.path.join(os.getcwd(), file))