#!/bin/csh
# FINDKEY - a c-shell script (version 24 Feb 07)
# FINDKEY is a front end for IDENTIFY. It provides an interactive
# menu interface, as well as overseeing the process of locating database
# entries by keywords.
#
# Revisions:
#
# 24 Feb 07 Added a ulimit command to run xylem_identify, which
#           sometimes goes into an infinite loop. This is related
#           to errors in input files, but it's not obvious yet
#           where the actual problem is.
# 26 Jul 02 Removed support for GenBank RNA division (obsolete)
# 26 Jul 02 Modified to call xylem_identify, rather than identify,
#           to avoid potential name conflicts
# 15 Feb 02 Run identify using nice to minimize impact of
#           runaway jobs. More careful checking is needed
#           in the future.
# 15 Feb 02 Support for VecBase removed
# 13 Mar 97 Added GSS and HTG divisions
#
#####################################################################
# set default parameters
####################################################################
# It is assumed that the following environment variables have been set
# by .cshrc or .login:
#    GB  - GenBank directory
#    PIR - PIR/NBRF directory

# Running findkey on a remote host
# If your databases are on a different host, you can still run findkey
# locally, and it will use rsh to run a findkey job on the remote host.
# To facilitate remote execution, you must set the environment variable
# XYLEM_RHOST to the name of the remote host, and XYLEM_USERID to your
# userid on that remote host (should be done in .cshrc)

# How the script was started, and which database is searched by default
set mode = interactive
set wheretolook = "p"

# Search terms and file names; all of them start out empty
set keyword = ""
set keyfile = ""
set dbfile = ""
set namefile = ""
set findfile = ""

set options = ""    #string to hold command line options for getloc
set runlocal = ""   # runlocal = y prevents remote execution as
                    # specified by XYLEM_RHOST.

# The process id of this shell is used to build temporary file names
set jobid = $$

# Cap the CPU time of this shell and its children, and define the
# prefix used to run xylem_identify at reduced priority
limit cputime 3600
set NICE = 'nice +8 '

# Platform-specific syntax is chosen based on XYLEM_PLATFORM.
# (default = sun)
# XYLEM_PLATFORM selects the command used to delete files. When the
# variable is not set, the Sun syntax is used.
if (! ${?XYLEM_PLATFORM}) set XYLEM_PLATFORM = sun   #Sun4, Sparcstations
switch ($XYLEM_PLATFORM)
  case "sun":
    set RM = "/usr/bin/rm -f"
    breaksw
  case "HP":
    set RM = "rm -f"
    breaksw
  default:
    set RM = "rm -f"
    breaksw
endsw

#####################################################################
# Read options from the command line and set parameters,
# or prompt user for parameters at terminal.
####################################################################
# Command line form:
#    findkey [-pbmgrdutielsaxShzL] keywordfile [namefile findfile]
#    findkey [-P PIR_dataset] keywordfile [namefile findfile]
#    findkey [-G GenBank_dataset] keywordfile [namefile findfile]
# The first non-option argument is the keyword file, the second the
# name file and the third the find file. With no arguments at all the
# interactive menu is shown instead.
set numargs = $#argv
if ($numargs != 0) then
  #---------------------------------------------------
  #parameters given in command line
  set mode = command
  set index = 1
  while ($index <= $numargs)
    set a = "$argv[$index]"
    switch ("$a")
      case "-p":
        set wheretolook = p
        breaksw
      case "-G":
        # -G takes the GenBank dataset file as the next argument
        set wheretolook = G
        @ index++
        if ($index > $numargs) then
          echo 'findkey: -G must be followed by a GenBank dataset file'
          exit 1
        endif
        set dbfile = "$argv[$index]"
        breaksw
      case "-P":
        # -P takes the PIR dataset file as the next argument
        set wheretolook = P
        @ index++
        if ($index > $numargs) then
          echo 'findkey: -P must be followed by a PIR dataset file'
          exit 1
        endif
        set dbfile = "$argv[$index]"
        breaksw
      case "-b":
        set wheretolook = b
        breaksw
      case "-m":
        set wheretolook = m
        breaksw
      case "-g":
        set wheretolook = g
        breaksw
      case "-r":
        set wheretolook = r
        breaksw
      case "-d":
        set wheretolook = d
        breaksw
      case "-u":
        set wheretolook = u
        breaksw
      case "-t":
        set wheretolook = t
        breaksw
      case "-z":
        set wheretolook = z
        breaksw
      case "-i":
        set wheretolook = i
        breaksw
      case "-l":
        set wheretolook = l
        breaksw
      case "-s":
        set wheretolook = s
        breaksw
      case "-a":
        set wheretolook = a
        breaksw
      case "-x":
        set wheretolook = x
        breaksw
      case "-e":
        set wheretolook = e
        breaksw
      case "-S":
        set wheretolook = S
        breaksw
      case "-h":
        set wheretolook = h
        breaksw
      case "-L":
        set runlocal = y
        breaksw
      case "-help":
      case "--help":
        # The usage text used to sit under a second -h label, which
        # could never be reached because -h already selects the HTG
        # division. It is offered under -help so that -h keeps its
        # meaning.
        echo 'Usage: findkey [-pbmgrdutielsaxShzL] keywordfile [namefile findfile]'
        echo ' findkey [-P PIR_dataset] keywordfile [namefile findfile]'
        echo ' findkey [-G GenBank_dataset] keywordfile [namefile findfile]'
        exit 0
      default:
        # Positional arguments fill keyfile, namefile and findfile in
        # that order. Any further arguments are ignored.
        if ("$keyfile" == "") then
          set keyfile = "$a"
        else if ("$namefile" == "") then
          set namefile = "$a"
        else if ("$findfile" == "") then
          set findfile = "$a"
        endif
        breaksw
    endsw
    @ index++
  end #while
  if ("$keyfile" == "") then
    echo 'No keyfile specified'
    exit 1
  endif
else
  #---------------------------------------------------------------
  # Interactive parameter input
  set complete = 0
  while ($complete == 0)
    #Display current parameter settings
    echo '___________________________________________________________________'
    echo ' FINDKEY - Version 26 Jul 02'
    echo ' Please cite: Fristensky (1993) Nucl. Acids Res. 21:5997-6003'
    echo '___________________________________________________________________'
    echo 'Keyfile:' $keyfile
    echo 'Dataset:' $dbfile
    echo '-------------------------------------------------------------------'
    echo ' Parameter Description Value'
    echo '-------------------------------------------------------------------'
    echo '1) Keyword Keyword to find '$keyword
    echo '2) Keyfile Get list of keywords from Keyfile'
    echo '3) WhereToLook p:PIR '$wheretolook
    echo ' GenBank - b:bacterial i:invertebrate'
    echo ' m:mamalian e:expressed seq. tag'
    echo ' g:phage l:plant'
    echo ' r:primate '
    echo ' d:rodent s:synthetic'
    echo ' u:unannotated a:viral'
    echo ' t:vertebrate x:patented'
    echo ' z:STS S:Genome Survey Seq.'
    echo ' h:HTG'
    echo ' G: GenBank dataset P: PIR dataset'
    echo ' -------------------------------------------------------------'
    echo ' Type number of your choice or 0 to continue:'

    #Prompt for parameter to change
    # Every read of a typed line is quoted. tcsh splits an unquoted
    # line into words, which breaks on a keyword made of several words
    # and on an empty line.
    set paramnum = "$<"
    switch ("$paramnum")
      case 0:
        #If parameter chosen is 0, and a minimal set of parameters are
        #set, terminate the loop (complete=1)
        if ("$keyword" != "" || "$keyfile" != "") then
          set complete = 1
        else
          echo '>>> Must specify value for Keyword or Keyfile'
        endif
        breaksw
      case 1:
        echo 'Type keyword to search for:'
        set keyword = "$<"
        set keyfile = ""
        breaksw
      case 2:
        echo 'Name of file containing keywords to search for:'
        set keyfile = "$<"
        set keyword = ""
        breaksw
      case 3:
        # The prompt and the accepted letters now agree with the menu
        # shown above and with the command line flags. In particular
        # x (patented) is accepted.
        echo 'Choose one of {pbmgrdutzielsaxShGP}'
        set temp = "$<"
        set valid = 0
        foreach choice (p b m g r d u t z i e l s a x S h G P)
          if ("$temp" == $choice) set valid = 1
        end
        if ($valid == 1) then
          if ("$temp" == G || "$temp" == P) then
            echo 'Name of file containing user-defined dataset:'
            set dbfile = "$<"
          endif
          set wheretolook = "$temp"
        else
          echo '>>> Invalid choice'
        endif
        breaksw
      default:
        breaksw
    endsw
  end #while
endif

# findkey -L forces local execution, overriding XYLEM_RHOST. This
# is necessary to prevent an infinite chain of calls to findkey
# on different hosts.
if (${?XYLEM_RHOST} && $wheretolook != G && $wheretolook != P && "$runlocal" != y) then
  #####################################################################
  # Run FINDKEY remotely, if XYLEM_RHOST and XYLEM_USERID are set
  ####################################################################
  # Remote hosts can be chosen by having a script called choosehost
  # in your bin directory. choosehost returns the name of a remote
  # host. While one possible implementation of choosehost is provided
  # with XYLEM, choosehost can be tailored to your particular
  # configuration of servers.
  if ($XYLEM_RHOST == choosehost) set XYLEM_RHOST = `choosehost`
  set tempname = "TMP_$jobid"
  set remotefn = $XYLEM_USERID@$XYLEM_RHOST\:$tempname

  # Put the keyword list on the remote host. A single keyword is first
  # written to a local temporary file.
  if ($mode == interactive) echo "Copying keyword file to $remotefn.kw ..."
  if ("$keyfile" == "") then
    echo "$keyword" > $tempname.kw
    rcp $tempname.kw $remotefn.kw
    $RM $tempname.kw
  else
    rcp $keyfile $remotefn.kw
  endif

  # The remote findkey is started with -L so that it searches locally
  if ($mode == interactive) echo "Running FINDKEY remotely on $XYLEM_RHOST as" "user $XYLEM_USERID ..."
  rsh $XYLEM_RHOST -l $XYLEM_USERID findkey -L -$wheretolook $tempname.kw $tempname.nam $tempname.fnd

  # Default output names are derived from the keyword or from the
  # keyword file name without its extension
  if ("$keyfile" == "") then
    set keyname = "$keyword"
  else
    set keyname = $keyfile:r
  endif
  if ("$namefile" == "") set namefile = "$keyname.nam"
  if ("$findfile" == "") set findfile = "$keyname.fnd"

  # Fetch the results and remove the remote temporary files
  if ($mode == interactive) echo Copying $remotefn.nam to $namefile ...
  rcp $remotefn.nam "$namefile"
  if ($mode == interactive) echo Copying $remotefn.fnd to $findfile ...
  rcp $remotefn.fnd "$findfile"
  if ($mode == interactive) echo Removing temporary files...
  rsh $XYLEM_RHOST -l $XYLEM_USERID $RM "$tempname.*"
else
  #####################################################################
  # For a given database, search annotation files for all entries
  # containing the specified keyword(s).
  ####################################################################
  # If only one keyword is present in keyfile, copy it to $keyword
  # This will cause FINDKEY to use egrep, which is faster than fgrep.
  if ("$keyfile" != "") then
    if (`wc -l <$keyfile` == 1) then
      set keyword = "`cat $keyfile`"
      set keyfile = ""
    endif
  endif

  # key is the stem of the default output file names: the keyword
  # itself, or the keyword file name without its extension
  if ("$keyfile" == "") then
    set key = "$keyword"
  else
    set key = $keyfile:r
  endif

  if ($wheretolook == p) then
    #------------------------ PIR/NBRF --------------------------
    if ("$namefile" == "") set namefile = "$key~pir.nam"
    if ("$findfile" == "") set findfile = "$key~pir.fnd"
    foreach div (pir1 pir2 pir3 pir4)
      set base = $PIR/$div
      if ($mode == interactive) echo "Searching $base.ano..."
      # egrep through the .ano file
      if ("$keyfile" == "") then
        nice egrep -i -n -e "$keyword" $base.ano > $jobid.grep
      else
        nice fgrep -i -n -f $keyfile $base.ano > $jobid.grep
      endif

      # identify each sequence found
      # temporarily store in $jobid.nam and $jobid.fnd, and then append
      # to $namefile and $findfile
      if (-z $jobid.grep) then
        if ($mode == interactive) echo "No matches found in $base.ano"
      else
        if ($mode == interactive) then
          echo "Sequence names will be written to $namefile"
          echo "Lines containing keyword(s) will be written to $findfile"
        endif
        $NICE xylem_identify $jobid.grep $base.ind $jobid.nam $jobid.fnd
        cat $jobid.nam >> "$namefile"
        cat $jobid.fnd >> "$findfile"
        $RM $jobid.*
      endif
    end
  else if ($wheretolook == G || $wheretolook == P) then
    #---------------- User-defined dataset ------------------
    # If dataset is not split, split it. A .gen file is split as
    # GenBank and a .pir file as PIR. Any other extension is taken to
    # mean that the .ano and .ind files already exist next to it.
    set base = $dbfile:r
    set dbextension = $dbfile:e
    set needtosplit = false
    if ("$dbextension" == "gen") then #GenBank
      set needtosplit = true
      set base = TMP$jobid
      splitdb $dbfile $base.ano $base.wrp $base.ind
    else if ("$dbextension" == "pir") then #PIR
      set needtosplit = true
      set base = TMP$jobid
      splitdb -p $dbfile $base.ano $base.wrp $base.ind
    endif

    if ("$namefile" == "") set namefile = "$key~$base.nam"
    if ("$findfile" == "") set findfile = "$key~$base.fnd"

    if ($mode == interactive) echo "Searching $base.ano..."
    # egrep through the .ano file
    if ("$keyfile" == "") then
      nice egrep -i -n -e "$keyword" $base.ano > $jobid.grep
    else
      nice fgrep -i -n -f $keyfile $base.ano > $jobid.grep
    endif

    if (-z $jobid.grep) then
      echo "No matches found in $base.ano"
    else
      if ($mode == interactive) then
        echo "Sequence names will be written to $namefile"
        echo "Lines containing keyword(s) will be written to $findfile"
      endif
      $NICE xylem_identify $jobid.grep $base.ind "$namefile" "$findfile"
    endif
    if ($needtosplit == true) $RM $base.ano $base.wrp $base.ind
  else
    #------------------------ GenBank ------------------
    # Set $div to the name of the database division to search
    switch ($wheretolook)
      case b:
        set div = bct
        breaksw
      case m:
        set div = mam
        breaksw
      case g:
        set div = phg
        breaksw
      case r:
        set div = pri
        breaksw
      case d:
        set div = rod
        breaksw
      case u:
        set div = una
        breaksw
      case t:
        set div = vrt
        breaksw
      case z:
        set div = sts
        breaksw
      case i:
        set div = inv
        breaksw
      case l:
        set div = pln
        breaksw
      case s:
        set div = syn
        breaksw
      case a:
        set div = vrl
        breaksw
      case x:
        set div = pat
        breaksw
      case e:
        set div = est
        breaksw
      case S:
        set div = gss
        breaksw
      case h:
        set div = htg
        breaksw
      default:
        breaksw
    endsw

    # $base is the name of the database file, without the extension
    # Most GenBank divisions are present in one file eg. gbrna.
    # Large GenBank divisions such as EST and Primate are split
    # eg. gbest1, gbest2, gbest3...
    # Regardless of how many divisions there are, BASESET creates
    # the list of all files for that division.
    if (-e $GB/gb$div.ind) then
      set BASESET = $GB/gb$div
    else
      set index = 1
      set BASESET = ()
      while (-e $GB/gb$div$index.ind)
        set BASESET = ($BASESET $GB/gb$div$index)
        @ index++
      end # while
    endif

    if ("$namefile" == "") set namefile = "$key~$div.nam"
    if ("$findfile" == "") set findfile = "$key~$div.fnd"

    #use grep to find the keyword(s), and then use xylem_identify to determine which
    #entries correspond to the occurrences of the keyword(s) found
    foreach division ($BASESET)
      if (-e $division.ind) then
        set base = $division
        if ($mode == interactive) echo "Searching $base.ano..."
        if ("$keyfile" == "") then
          nice egrep -i -n -e "$keyword" $base.ano > $jobid.grep
        else
          nice fgrep -i -n -f $keyfile $base.ano > $jobid.grep
        endif
        if (-z $jobid.grep) then
          if ($mode == interactive) echo "No matches found in $base.ano"
        else
          if ($mode == interactive) then
            echo "Sequence names will be written to $namefile"
            echo "Lines containing keyword(s) will be written to $findfile"
          endif
          $NICE xylem_identify $jobid.grep $base.ind $jobid.nam $jobid.fnd
          # Results are appended only when this file produced matches,
          # and the temporary files are removed right away. Otherwise
          # a later file without matches would append the results of
          # an earlier file a second time.
          cat $jobid.nam >> "$namefile"
          cat $jobid.fnd >> "$findfile"
          $RM $jobid.nam $jobid.fnd
        endif
      endif
    end
  endif
endif

#####################################################################
# Clean up.
####################################################################
# nonomatch keeps the shell from aborting the command with a No match
# error when no temporary files are left.
set nonomatch
$RM $jobid.*