#!/usr/bin/env python import birchscript import re import os import os.path import subprocess import sys # FINDKEY - a c-shell script (version 24 Feb 07) # FINDKEY is a front end for IDENTIFY. It provides an interactive # menu interface, as well as overseeing the process of locating database # entries by keywords. # # Revisions: # # 24 Feb 07 Added a ulimit command to run xylem_identify, which # sometimes goes into an infinite loop. This is related # to errors in input files, but it's not obvious yet # where the actual problem is. # 26 Jul 02 Removed support for GenBank RNA division (obsolete) # 26 Jul 02 Modified to call xylem_identify, rather than identify, # to avoid potential name conflicts # 15 Feb 02 Run identify using nice to minimize impact of # runaway jobs. More careful checking is needed # in the future. # 15 Feb 02 Support for VecBase removed # 13 Mar 97 Added GSS and HTG divisions # ##################################################################### # set default parameters #################################################################### # It is assumed that the following environment variables have been set # by .cshrc or .login: # GB - GenBank directory # PIR - PIR/NBRF directory # Running findkey on a remote host # If your databases are on a different host, you can still run findkey # locally, and it will use rsh to run a findkey job on the remote host. # To facilitate remote execution, you must set the environment variable # XYLEM_RHOST to the name of the remote host, and XYLEM_USERID to your # userid on that remote host (should be done in .cshrc) keyword = "" keyfile = "" dbfile = "" namefile = "" findfile = "" wheretolook = "p" mode = "interactive" options = "" #string to hold command line options for getloc jobid = os.getcwd() forcelocal = False # forcelocal prevents remote execution as # specified by XYLEM_RHOST. needtosplit = True if os.environ.has_key("XYLEM_PLATFORM"): XYLEM_PLATFORM = os.environ.get("XYLEM_PLATFORM") else: XYLEM_PLATFORM = "sun" #Sun4, Sparcstations if XYLEM_PLATFORM == "sun": RM = "/usr/bin/rm -f" else: RM = "rm -f" NICE = 8 # after 3600 seconds (1 hour), if the script is still running, it will automatically exit # equivalent to limit 3600 t = Timer(3600, exit) t.start() ##################################################################### # Read options from the command line and set parameters, # or prompt user for parameters at terminal. #################################################################### if len(sys.argv) != 1: #--------------------------------------------------- #parameters given in command line mode = "command" arg_index = 1 while arg_index <= len(sys.argv): argtest = sys.argv[arg_index] if argtest == "-p": wheretolook = "p" elif argtest == "-G": wheretolook = "G" arg_index += 1 dbfile = sys.argv[arg_index] elif argtest == "-P": wheretolook = "P" arg_index += 1 dbfile = sys.argv[arg_index] elif argtest in ("-b", "-m", "-g", "-r", "-d", "-u", "-t", "-z", "-i", "-l", "-s", "-a", "-x", "-e", "-S", "-h"): wheretolook = argtest[1] elif argtest == "-L": forcelocal = True elif argtest == "-h": print 'Usage: findkey [-pbmgrdutielsaxShzL] keywordfile [namefile findfile]' print ' findkey [-P PIR_dataset] keywordfile [namefile findfile]' print ' findkey [-G GenBank_dataset] keywordfile [namefile findfile]' else: if keyfile == "": keyfile = argtest elif namefile == "": namefile = argtest elif findfile == "": findfile = argtest arg_index += 1 # ends while if keyfile == "": print 'No keyfile specified' exit() else: #--------------------------------------------------------------- # Interactive parameter input complete = False while not complete: #Display current parameter settings print '___________________________________________________________________' print ' FINDKEY - Version 26 Jul 02' print ' Please cite: Fristensky (1993) Nucl. Acids Res. 21:5997-6003' print '___________________________________________________________________' print 'Keyfile: ' + keyfile print 'Dataset: ' + dbfile print '-------------------------------------------------------------------' print ' Parameter Description Value' print '-------------------------------------------------------------------' print '1) Keyword Keyword to find ' + keyword print '2) Keyfile Get list of keywords from Keyfile' print '3) WhereToLook p:PIR ' + wheretolook print ' GenBank - b:bacterial i:invertebrate' print ' m:mamalian e:expressed seq. tag' print ' g:phage l:plant' print ' r:primate ' print ' d:rodent s:synthetic' print ' u:unannotated a:viral' print ' t:vertebrate x:patented' print ' z:STS S:Genome Survey Seq.' print ' h:HTG' print ' G: GenBank dataset P: PIR dataset' print ' -------------------------------------------------------------' #Prompt for parameter to change paramnum = raw_input(' Type number of your choice or 0 to continue:') if paramnum == 0: if keyword != "" or keyfile != "": complete = True else: print '>>> Must specify value for Keyword or Keyfile' elif paramnum == 1: keyword = raw_input('Type keyword to search for:') keyfile = "" elif paramnum == 2: keyfile = raw_input('Name of file containing keywords to search for:') keyword = "" elif paramnum == 3: temp = raw_input('Choose one of {pevbimoglrndsuatGP}') if temp in ("p", "b", "i", "m", "e", "g", "l", "r", "d", "s", "u", "a", "t", "z", "G", "P ", "S", "h"): if temp in ("G", "P"): dbfile = raw_input('Name of file containing user-defined dataset:') wheretolook = temp else: print '>>> Invalid choice' #If parameter chosen is 0, and a minimal set of parameters are #set, terminate the loop (complete=True) #end while # findkey -L forces local execution, overriding XYLEM_RHOST. This # is necessary to prevent an infinite chain of calls to findkey # on different hosts. if os.environ.has_key("XYLEM_RHOST") and wheretolook not in ("G", "P") and not forcelocal: ##################################################################### # Run FINDKEY remotely, if XYLEM_RHOST and XYLEM_USERID are set #################################################################### # Remote hosts can be chosen by having a script called choosehost # in your bin directory. choosehost returns the name of a remote # host. While one possible implementation of choosehost is provided # with XYLEM, choosehost can be tailored to your particular # configuration of servers. XYLEM_RHOST = os.environ.get("XYLEM_RHOST") if XYLEM_RHOST == "choosehost": XYLEM_RHOST = birchscript.choosehost() XYLEM_USERID = os.environ.get("XYLEM_USERID") tempname = "TMP_" + jobid remotefn = XYLEM_USERID + "@" + XYLEM_RHOST + ":" + tempname if mode == "interactive": print "Copying keyword file to $remotefn.kw ..." if keyfile == "": h_tempnamekw = open(tempname + ".kw", "w") print >> h_tempnamekw, keyword h_tempnamekw.close() subprocess.call(["rcp", tempname + ".kw", remotefn + ".kw"]) os.remove(tempname + ".kw") else: subprocess.call(["rcp", keyfile + ".kw", remotefn + ".kw"]) if mode == "interactive": print "Running FINDKEY remotely on $XYLEM_RHOST as user $XYLEM_USERID ..." subprocess.call(["rsh", XYLEM_RHOST, "-l", XYLEM_USERID, "findkey", "-L", "-" + wheretolook, tempname + ".kw", \ tempname + ".nam", tempname + ".fnd"]) if keyfile == "": keyname = keyword else: keyname = os.path.splitext(keyfile)[0] if namefile == "": namefile = keyname + ".nam" if findfile == "": findfile = keyname + ".fnd" if mode == "interactive": print "Copying $remotefn.nam to " + namefile + " ..." subprocess.call(["rcp", remotefn + ".nam", namefile]) if mode == "interactive": print "Copying " + remotefn + ".fnd to $findfile ..." subprocess.call(["rcp", remotefn + ".fnd", findfile]) if mode == "interactive": print "Removing temporary files..." subprocess.call(["rsh", XYLEM_RHOST, "-l", XYLEM_USERID, RM, "tempname.*"]) else: ##################################################################### # For a given database, search annotation files for all entries # containing the specified keyword(s). #################################################################### # If only one keyword is present in keyfile, copy it to $keyword # This will cause FINDKEY to use egrep, which is faster than fgrep. if keyfile != "": h_keyfile = open("keyfile", "r") keyfile_lines = h_keyfile.readlines() if len(keyfile_lines) == 1: keyword = keyfile_lines[0] keyfile = "" h_keyfile.close() if wheretolook == "p": #------------------------ PIR/NBRF -------------------------- for div in ["pir1", "pir2", "pir3", "pir4"]: base = os.path.join(PIR, div) if mode == "interactive": print "Searching $base.ano..." # egrep through the .ano file key = egrepper(keyword, keyfile, base, jobid) if namefile == "": namefile = key + "~pir.nam" if findfile == "": findfile = key + "~pir.fnd" # identify each sequence found # temporarily store in $jobid..nam & $jobid..fnd, and then append # to $namefile and $findfile if not os.file.exists(jobid + ".grep"): if mode == "interactive": print "No matches found in $base.ano" else: if mode == "interactive": print "Sequence names will be written to " + namefile print "Lines containing keyword(s) will be written to " + findfile os.nice(NICE) subprocess.call(["xylem_identify", jobid + ".grep", base + ".ind", jobid + ".nam", jobid + ".fnd"]) os.nice(0) h_namefile = open(namefile, "a") birchscript.cat_to(jobid + ".nam", h_namefile) h_namefile.close() h_findfile = open(findfile, "a") birchscript.cat_to(jobid + ".fnd", h_findfile) h_findfile.close() for filename in os.listdir(os.getcwd()): if (jobid + ".") in filename and os.path.exists(filename): os.remove(os.path.join(os.getcwd(), filename)) else: if wheretolook in ("G", "P"): #---------------- User-defined dataset ------------------ # If dataset is not split, split it base = os.path.splitext(dbfile)[0] dbextension = os.path.splitext(dbfile)[1] if dbextension == ".gen": #GenBank needtosplit = True base = "TMP" + jobid subprocess.call(["splitdb", dbfile, base + ".ano", base + ".wrp", base + ".ind"]) else: if dbextension == ".pir": #PIR needtosplit = True base = "TMP" + jobid subprocess.call(["splitdb", "-p", dbfile, base + ".ano", base + ".wrp", base + ".ind"]) else: needtosplit = False if mode == "interactive": print "Searching $base.ano..." # egrep through the .ano file key = egrepper(keyword, keyfile, base, jobid) if namefile == "": namefile = key + "~pir.nam" if findfile == "": findfile = key + "~pir.fnd" if not os.path.exists(jobid + ".grep"): print "No matches found in " + base + ".ano" else: if mode == "interactive": print "Sequence names will be written to $namefile" print "Lines containing keyword(s) will be written to $findfile" os.nice(NICE) subprocess.call(["xylem_identify", jobid + ".grep", base + ".ind", namefile, findfile]) os.nice(0) if needtosplit: os.remove(base + ".ano") os.remove(base + ".wrp") os.remove(base + ".ind") else: #------------------------ GenBank ------------------ # Set $div to the name of the database division to search divisions = { "a": "vrl", "b": "bct", "d": "rod", "e": "est",\ "g": "phg", "h": "htg", "i": "inv", "l": "pln",\ "m": "mam", "r": "pri", "s": "syn", "S": "gss",\ "t": "vrt", "u": "una", "x": "pat", "z": "sts"} if divisions.has_key(wheretolook): div = divisions[wheretolook] # $base is the name of the database file, without the extension # Most GenBank divisions are present in one file eg. gbrna. # Large GenBank divisions such as EST and Primate are split # eg. gbest1, gbest2, gbest3... # Regardless of how many divisions there are, BASESET creates # the list of all files for that division. if os.path.exists(os.path.join(GB, "gb" + div + ".ind")): BASESET = os.path.join(GB, "gb" + div) else: index = 1 BASESET = [] while os.path.exists(os.path.join(GB, "gb" + div + index + ".ind")): BASESET.append(os.path.join(GB, "gb" + div + index + ".ind")) index += 1 #use grep to find the keyword(s), and then use xylem_identify to determine which #entries correspond to the occurrences of the keyword(s) found for division in BASESET: if os.path.exists(division + ".ind"): base = division if mode == "interactive": print "Searching $base.ano..." # egrep through the .ano file key = egrepper(keyword, keyfile, base, jobid) if namefile == "": namefile = key + "~" + div + ".nam" if findfile == "": findfile = key + "~" + div + ".fnd" if not os.path.exists(jobid + ".grep"): if mode == "interactive": print "No matches found in $base.ano" else: if mode == "interactive": print "Sequence names will be written to $namefile" print "Lines containing keyword(s) will be written to $findfile" os.nice(NICE) subprocess.call(["xylem_identify", jobid + ".grep", base + ".ind", jobid + ".nam", jobid + ".fnd"]) os.nice(0) h_namefile = open(namefile, "a") birchscript.cat_to(jobid + ".nam", h_namefile) h_namefile.close() h_findfile = open(findfile, "a") birchscript.cat_to(jobid + ".fnd", h_findfile) h_findfile.close() ##################################################################### # Clean up. #################################################################### for filename in os.listdir(os.getcwd()): if (jobid + ".") in filename and os.path.exists(filename): os.remove(os.path.join(os.getcwd(), filename)) def egrepper(keyword, keyfile, base, jobid): os.nice(10) h_baseano = open(base + ".ano", "r") baseanolines = h_baseano.readlines() h_baseano.close() h_result = open(jobid + ".grep", "w") # egrep through the .ano file if keyfile == "": key = keyword # egrep -i -n -e $keyword $base.ano > $jobid.grep for line in baseanolines: if re.search(keyword): h_result.write(line) else: key = os.path.splitext(keyfile)[0] # fgrep -i -n -f $keyfile $base.ano > $jobid.grep h_keyfile = open(keyfile, "r") for kw in keyfile: for line in baseanolines: if re.search(kw): h_result.write(line) h_keyfile.close() h_result.close() os.nice(0) return key