#!/usr/bin/env python

import birchscript
import re
import os
import os.path
import subprocess
import sys

# FINDKEY - a c-shell script (version 24 Feb 07)
# FINDKEY is a front end for IDENTIFY. It provides an interactive
# menu interface, as well as overseeing the process of locating database
# entries by keywords.
#
# Revisions:
#
#    24 Feb 07  Added a ulimit command to run xylem_identify, which
#               sometimes goes into an infinite loop. This is related
#               to errors in input files, but it's not obvious yet
#               where the actual problem is.
#    26 Jul 02  Removed support for GenBank RNA division (obsolete)
#    26 Jul 02  Modified to call xylem_identify, rather than identify,
#               to avoid potential name conflicts
#    15 Feb 02  Run identify using nice to minimize impact of
#               runaway jobs. More careful checking is needed 
#               in the future.
#    15 Feb 02  Support for VecBase removed
#    13 Mar 97  Added GSS and HTG divisions
#
#####################################################################
# set default parameters
####################################################################
# It is assumed that the following environment variables have been set
# by .cshrc or .login:
#   GB  - GenBank directory
#   PIR - PIR/NBRF directory

# Running findkey on a remote host
# If your databases are on a different host, you can still run findkey
# locally, and it will use rsh to run a findkey job on the remote host.
# To facilitate remote execution, you must set the environment variable
# XYLEM_RHOST to the name of the remote host, and XYLEM_USERID to your
# userid on that remote host (should be done in .cshrc)

keyword     = ""
keyfile     = ""
dbfile      = ""
namefile    = ""
findfile    = ""
wheretolook = "p"
mode        = "interactive"
options     = "" #string to hold command line options for getloc
jobid       = os.getcwd()
forcelocal  = False  # forcelocal prevents remote execution as
                     # specified by XYLEM_RHOST.
needtosplit = True

if os.environ.has_key("XYLEM_PLATFORM"):
    XYLEM_PLATFORM = os.environ.get("XYLEM_PLATFORM")
else:
    XYLEM_PLATFORM = "sun" #Sun4, Sparcstations

if XYLEM_PLATFORM == "sun":
    RM = "/usr/bin/rm -f"
else:
    RM = "rm -f"

NICE = 8

# after 3600 seconds (1 hour), if the script is still running, it will automatically exit
# equivalent to limit 3600
t = Timer(3600, exit)
t.start()

#####################################################################
# Read options from the command line and set parameters,
# or prompt user for parameters at terminal.
####################################################################
if len(sys.argv) != 1:
    #---------------------------------------------------
    #parameters given in command line
    mode  = "command"
    arg_index = 1
    while arg_index <= len(sys.argv):
        argtest = sys.argv[arg_index]
        if argtest == "-p":
            wheretolook = "p"
        elif argtest == "-G":
            wheretolook = "G"
            arg_index += 1
            dbfile = sys.argv[arg_index]
        elif argtest == "-P":
            wheretolook = "P"
            arg_index += 1
            dbfile = sys.argv[arg_index]
        elif argtest in ("-b", "-m", "-g", "-r", "-d", "-u", "-t", "-z", "-i", "-l", "-s", "-a", "-x", "-e", "-S", "-h"):
            wheretolook = argtest[1]
        elif argtest == "-L":
            forcelocal = True
        elif argtest == "-h":
            print 'Usage: findkey [-pbmgrdutielsaxShzL] keywordfile [namefile findfile]'
            print '       findkey [-P PIR_dataset] keywordfile [namefile findfile]'
            print '       findkey [-G GenBank_dataset] keywordfile [namefile findfile]'
        else:
            if keyfile == "":
                keyfile = argtest
            elif namefile == "":
                namefile = argtest
            elif findfile == "":
                findfile = argtest
        arg_index += 1
    # ends while

    if keyfile == "":
        print 'No keyfile specified'
        exit()
else:
    #---------------------------------------------------------------
    # Interactive parameter input
    complete = False
    while not complete:
        #Display current parameter settings
        print '___________________________________________________________________'
        print '                     FINDKEY - Version 26 Jul 02'
        print '    Please cite: Fristensky (1993) Nucl. Acids Res. 21:5997-6003'
        print '___________________________________________________________________'
        print 'Keyfile: ' + keyfile
        print 'Dataset: ' + dbfile
        print '-------------------------------------------------------------------'
        print '   Parameter              Description                      Value'
        print '-------------------------------------------------------------------'
        print '1) Keyword     Keyword to find                               ' + keyword
        print '2) Keyfile     Get list of keywords from Keyfile'
        print '3) WhereToLook p:PIR                                         ' + wheretolook
        print '               GenBank - b:bacterial     i:invertebrate'
        print '                         m:mamalian      e:expressed seq. tag'
        print '                         g:phage         l:plant'
        print '                         r:primate       '
        print '                         d:rodent        s:synthetic'
        print '                         u:unannotated   a:viral'
        print '                         t:vertebrate    x:patented'
        print '                         z:STS           S:Genome Survey Seq.'
        print '                         h:HTG'
        print '               G: GenBank dataset        P: PIR dataset'
        print '   -------------------------------------------------------------'

        #Prompt for parameter to change
        paramnum = raw_input('   Type number of your choice or 0 to continue:')

        if paramnum == 0:
            if keyword != "" or keyfile != "":
                complete = True
            else:
                print '>>> Must specify value for Keyword or Keyfile'
        elif paramnum == 1:
            keyword = raw_input('Type keyword to search for:')
            keyfile = ""
        elif paramnum == 2:
            keyfile = raw_input('Name of file containing keywords to search for:')
            keyword = ""
        elif paramnum == 3:
            temp = raw_input('Choose one of {pevbimoglrndsuatGP}')
            if temp in ("p", "b", "i", "m", "e", "g", "l", "r", "d", "s", "u", "a", "t", "z", "G", "P ", "S", "h"):
                if temp in ("G", "P"):
                    dbfile = raw_input('Name of file containing user-defined dataset:')
                wheretolook = temp
            else:
                print '>>> Invalid choice'
        #If parameter chosen is 0, and a minimal set of parameters are
        #set, terminate the loop (complete=True)
    #end while


# findkey -L forces local execution, overriding XYLEM_RHOST. This 
# is necessary to prevent an infinite chain of calls to findkey
# on different hosts.
if os.environ.has_key("XYLEM_RHOST") and wheretolook not in ("G", "P") and not forcelocal:
    #####################################################################
    #  Run FINDKEY remotely, if XYLEM_RHOST and XYLEM_USERID are set
    ####################################################################
    # Remote hosts can be chosen by having a script called choosehost
    # in your bin directory. choosehost returns the name of a remote
    # host. While one possible implementation of choosehost is provided
    # with XYLEM, choosehost can be tailored to your particular
    # configuration of servers.
    XYLEM_RHOST = os.environ.get("XYLEM_RHOST")
    if XYLEM_RHOST == "choosehost":
        XYLEM_RHOST = birchscript.choosehost()
    XYLEM_USERID = os.environ.get("XYLEM_USERID")

    tempname = "TMP_" + jobid
    remotefn = XYLEM_USERID + "@" + XYLEM_RHOST + ":" + tempname
    if mode == "interactive":
        print "Copying keyword file to $remotefn.kw ..."
    if keyfile == "":
        h_tempnamekw = open(tempname + ".kw", "w")
        print >> h_tempnamekw, keyword
        h_tempnamekw.close()
        subprocess.call(["rcp", tempname + ".kw", remotefn + ".kw"])
        os.remove(tempname + ".kw")
    else:
        subprocess.call(["rcp", keyfile + ".kw", remotefn + ".kw"])

    if mode == "interactive":
        print "Running FINDKEY remotely on $XYLEM_RHOST as user $XYLEM_USERID ..."

    subprocess.call(["rsh", XYLEM_RHOST, "-l", XYLEM_USERID, "findkey", "-L", "-" + wheretolook, tempname + ".kw", \
                                      tempname + ".nam", tempname + ".fnd"])
    if keyfile == "":
        keyname = keyword
    else:
        keyname = os.path.splitext(keyfile)[0]

    if namefile == "":
        namefile = keyname + ".nam"
    if findfile == "":
        findfile = keyname + ".fnd"

    if mode == "interactive":
        print "Copying $remotefn.nam to " + namefile + " ..."
    subprocess.call(["rcp", remotefn + ".nam", namefile])
    if mode == "interactive":
        print "Copying " + remotefn + ".fnd to $findfile ..."
    subprocess.call(["rcp", remotefn + ".fnd", findfile])

    if mode == "interactive":
        print "Removing temporary files..."
    subprocess.call(["rsh", XYLEM_RHOST, "-l", XYLEM_USERID, RM, "tempname.*"])

else:
    #####################################################################
    #  For a given database, search annotation files for all entries
    #  containing the specified keyword(s).
    ####################################################################

    # If only one keyword is present in keyfile, copy it to $keyword
    # This will cause FINDKEY to use egrep, which is faster than fgrep.
    if keyfile != "":
        h_keyfile = open("keyfile", "r")
        keyfile_lines = h_keyfile.readlines()
        if len(keyfile_lines) == 1:
            keyword =  keyfile_lines[0]
            keyfile = ""
        h_keyfile.close()

    if wheretolook == "p":
        #------------------------ PIR/NBRF --------------------------
        for div in ["pir1", "pir2", "pir3", "pir4"]:
            base = os.path.join(PIR, div)

            if mode == "interactive":
                print "Searching $base.ano..."

            # egrep through the .ano file
            key = egrepper(keyword, keyfile, base, jobid)

            if namefile == "":
                namefile = key + "~pir.nam"
            if findfile == "":
                findfile = key + "~pir.fnd"

            # identify each sequence found
            # temporarily store in $jobid..nam & $jobid..fnd, and then append
            # to $namefile and $findfile
            if not os.file.exists(jobid + ".grep"):
                if mode == "interactive":
                    print "No matches found in $base.ano"
            else:
                if mode == "interactive":
                    print "Sequence names will be written to " + namefile
                    print "Lines containing keyword(s) will be written to " + findfile
                os.nice(NICE)
                subprocess.call(["xylem_identify", jobid + ".grep", base + ".ind", jobid + ".nam", jobid + ".fnd"])
                os.nice(0)

                h_namefile = open(namefile, "a")
                birchscript.cat_to(jobid + ".nam", h_namefile)
                h_namefile.close()

                h_findfile = open(findfile, "a")
                birchscript.cat_to(jobid + ".fnd", h_findfile)
                h_findfile.close()

                for filename in os.listdir(os.getcwd()):
                    if (jobid + ".") in filename and os.path.exists(filename):
                        os.remove(os.path.join(os.getcwd(), filename))
    else:
        if wheretolook in ("G", "P"):
            #---------------- User-defined dataset ------------------
            # If dataset is not split, split it
            base = os.path.splitext(dbfile)[0]
            dbextension = os.path.splitext(dbfile)[1]
            if dbextension == ".gen": #GenBank
                needtosplit = True
                base = "TMP" + jobid
                subprocess.call(["splitdb", dbfile, base + ".ano", base + ".wrp", base + ".ind"])
            else:
                if dbextension == ".pir": #PIR
                    needtosplit = True
                    base = "TMP" + jobid
                    subprocess.call(["splitdb", "-p", dbfile, base + ".ano", base + ".wrp", base + ".ind"])
                else:
                    needtosplit = False

            if mode == "interactive":
                print "Searching $base.ano..."

            # egrep through the .ano file
            key = egrepper(keyword, keyfile, base, jobid)

            if namefile == "":
                namefile = key + "~pir.nam"
            if findfile == "":
                findfile = key + "~pir.fnd"

            if not os.path.exists(jobid + ".grep"):
                print "No matches found in " + base + ".ano"
            else:
                if mode == "interactive":
                    print "Sequence names will be written to $namefile"
                    print "Lines containing keyword(s) will be written to $findfile"
                os.nice(NICE)
                subprocess.call(["xylem_identify", jobid + ".grep", base + ".ind", namefile, findfile])
                os.nice(0)
            if needtosplit:
                os.remove(base + ".ano")
                os.remove(base + ".wrp")
                os.remove(base + ".ind")
        else:
            #------------------------  GenBank  ------------------
            # Set $div to the name of the database division to search
            divisions = { "a": "vrl",  "b": "bct",  "d": "rod",  "e": "est",\
                          "g": "phg",  "h": "htg",  "i": "inv",  "l": "pln",\
                          "m": "mam",  "r": "pri",  "s": "syn",  "S": "gss",\
                          "t": "vrt",  "u": "una",  "x": "pat",  "z": "sts"}
            if divisions.has_key(wheretolook):
                div = divisions[wheretolook]

            # $base is the name of the database file, without the extension
            # Most GenBank divisions are present in one file eg. gbrna.
            # Large GenBank divisions such as EST and Primate are split
            # eg. gbest1, gbest2, gbest3...
            # Regardless of how many divisions there are, BASESET creates
            # the list of all files  for that division.
            if os.path.exists(os.path.join(GB, "gb" + div + ".ind")):
                BASESET = os.path.join(GB, "gb" + div)
            else:
                index = 1
                BASESET = []
                while os.path.exists(os.path.join(GB, "gb" + div + index + ".ind")):
                    BASESET.append(os.path.join(GB, "gb" + div + index + ".ind"))
                    index += 1

            #use grep to find the keyword(s), and then use xylem_identify to determine which
            #entries correspond to the occurrences of the keyword(s) found

            for division in BASESET:
                if os.path.exists(division + ".ind"):
                    base = division
                    if mode == "interactive":
                        print "Searching $base.ano..."

                    # egrep through the .ano file
                    key = egrepper(keyword, keyfile, base, jobid)

                    if namefile == "":
                        namefile = key + "~" + div + ".nam"
                    if findfile == "":
                        findfile = key + "~" + div + ".fnd"

                    if not os.path.exists(jobid + ".grep"):
                        if mode == "interactive":
                            print "No matches found in $base.ano"
                    else:
                        if mode == "interactive":
                            print "Sequence names will be written to $namefile"
                            print "Lines containing keyword(s) will be written to $findfile"
                        os.nice(NICE)
                        subprocess.call(["xylem_identify", jobid + ".grep", base + ".ind", jobid + ".nam", jobid + ".fnd"])
                        os.nice(0)

                    h_namefile = open(namefile, "a")
                    birchscript.cat_to(jobid + ".nam", h_namefile)
                    h_namefile.close()

                    h_findfile = open(findfile, "a")
                    birchscript.cat_to(jobid + ".fnd", h_findfile)
                    h_findfile.close()

#####################################################################
#  Clean up.
####################################################################
for filename in os.listdir(os.getcwd()):
    if (jobid + ".") in filename and os.path.exists(filename):
        os.remove(os.path.join(os.getcwd(), filename))

def egrepper(keyword, keyfile, base, jobid):
    os.nice(10)
    h_baseano = open(base + ".ano", "r")
    baseanolines = h_baseano.readlines()
    h_baseano.close()

    h_result  = open(jobid + ".grep", "w")
    # egrep through the .ano file
    if keyfile == "":
        key = keyword
        # egrep -i -n -e $keyword $base.ano > $jobid.grep
        for line in baseanolines:
            if re.search(keyword):
                h_result.write(line)
    else:
        key = os.path.splitext(keyfile)[0]
        # fgrep -i -n -f $keyfile $base.ano > $jobid.grep
        h_keyfile = open(keyfile, "r")
        for kw in keyfile:
            for line in baseanolines:
                if re.search(kw):
                    h_result.write(line)
        h_keyfile.close()
    h_result.close()
    os.nice(0)
    return key