#!/usr/bin/env python3

"""
Dr. Brian Fristensky, University of Manitoba

 Description: Wrapper for running getob

 Synopsis: features.py expression  
           features [-f featurekey | -F keyfile]
                     [-n name     |-a accession    | -e expression |
                      -N namefile |-A accfile      | -E expfile]
                     [-u dbfile   | -U dbfile      | -g ] 
                     [ -o outname ]
            features -h     

@modified: August 2, 2024
@author: Brian Fristensky
@contact: brian.fristensky@umanitoba.ca
"""

import argparse
from pathlib import PurePath
import os.path
import re
import shutil
import subprocess
import sys

PROGRAM = 'features.py: '
USAGE = '''\t USAGE: features.py expression
\t features [-f featurekey | -F keyfile] 
\t[-n name     |-a accession    | -e expression |
\t-N namefile |-A accfile      | -E expfile]
\t[-u dbfile   | -U dbfile      | -g ] 
\t[-o outname ]
'''
PID = str(os.getpid())
DEBUG = True
NL = "\n"


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"returns a list of lines from a file. Analogous to Unix cat command."
def Contents(FN) :
    infile = open(FN,"r") 
    textlines = []
    for line in infile.readlines() :
        textlines.append(line.rstrip())
    return textlines    
    infile.close()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"""
Check to see if a file is empty, other than whitespace.
Returns False if any non-blank characters are found.
"""
def Empty(FN) :
    FoundNonBlanks = False
    if os.path.exists(FN) :
        with open(FN,"rb") as infile :
            while True :
                line = infile.readline()
                if not line: # EOF
                    break
                elif len(line.strip()) > 0 : # Found a line with non-blank characters
                    FoundNonBlanks = True
                    break
    else :
        print(">>> features.py: " + FN + ": file not found)")    
    return not FoundNonBlanks

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"Write a file of features to be extracted by GETOB."
def WriteFeaFile(FN,FEATURES) :
    feafile = open(FN,"w")
    FEATURES = ["OBJECTS"] + FEATURES + ["SITES"]
    for line in FEATURES :
        feafile.write(line + NL)
    feafile.close()

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"Write a file of LOCUS or ACCESSION id's to be processed by GETOB."
def WriteIdFile(FN,ENTRIES) :
    idfile = open(FN,"w")
    for line in ENTRIES :
        idfile.write(line + NL)
    idfile.close()
    
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"Extract ACCESSION numbers (ie. column 2) from XYLEM index file."
def IndexToAccFile(INDFILE,ACCFILE) :
    indfile = open(INDFILE,"rb")
    accfile = open(ACCFILE,"w")
    for line in indfile.readlines() :
        line = str(line)
        if line[0]  != chr(0) : # splitdb may terminate a file with a line containing nul characters
            tokens = re.sub(' +', ' ',line).split(' ')
            if len(tokens) > 1 :
                accession = tokens[1]
                accfile.write(accession + NL)
    indfile.close() 
    accfile.close() 

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# For any unresolved expressions, return a list of ACCESSION numbers
# for GenBank entries needed to resolve those expressions 
def EntriesNeeded(PreviousResult) :
    EntriesToBeRetrieved = []
    infile = open(PreviousResult,"r")
    for line in infile.readlines() :
        if line[0].startswith("@") : 
            accession = line[1:].split(":")[0]
    infile.close()
    return EntriesToBeRetrieved
      
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"Wrapper class for command line parameters"
class Parameters:

    def __init__(self):
        """
                Initializes arguments:
                Then calls read_args() to fill in their values from command line

        """
        # GenBank feature keys for sequence features to be extracted
        self.FEATURES = []
        self.FEAFILE = ""     
        # LOCUS names or ACCESSION numbers of entries to be processed
        self.BASENAME = "" # base name to use for output files
        self.ACCESSION = True #False if ID is LOCUS, rather than ACCESSION
        self.IDFILE = "" # file of LOCUS names or ACCESSION numbers of entries to be processed
        self.ENTRIES = []

        # for u or U options, we are reading in feature expressions to evaluate into sequence
        self.EXPRESSIONS = []
        self.EXPRFILE = ""
        self.RESOLVE = 	False # Input contains one or more expressions that need to be resolved
        
        self.SEPARATE = False
        
        # True when the only parameter is a  single feature expression
        # No messages are printed. The only output is the sequence resulting from evaluating
        # the expression.
        self.QUIET = False 
        
        # Database from which to retrieve GenBank entries

        self.DBTYPE = "g" # source is GenBank (NCBI); -u or -U source is a local database subset
        self.DBFILE = "" 
        self.NEEDTOSPLIT = False # True if database is a GenBank entry that needs to be split into
                           #annotation, sequence and index
        self.ANOFILE = ""
        self.SEQFILE = ""
        self.INDFILE = ""
        self.OUTNAME = ""
        
        # temporary files
        self.PREFIX = "FEA" + "." + PID # prefix for temporary files
        self.TEMPFILES = []
        
        self.read_args()
        
        # Options for getob

        self.GETOBOPTIONS = ""
        if self.RESOLVE :
            self.GETOBOPTIONS = "-r"
        if self.ACCESSION :
            self.GETOBOPTIONS += " -c"        
                 
    def read_args(self):
        """
                Read command line arguments into a Paramters object
                """   
        # . . . . . . . . . . .ONE ARGUMENT FORM. . . . . . . . 
        #A FEATURES expression is given as the input, returning ONLY
        #the resultant sequence to the standard output.
        # Example: features.py @M81884:complement(join(70023..70028,1..69))
        if len(sys.argv) == 1 :
            print(USAGE)
        elif len(sys.argv) == 2 :
            if not sys.argv[0].startswith("-") :
                self.RESOLVE = True
                self.QUIET = True
                self.ACCESSION = True
                self.NEEDTOSPLIT = True
                expr = sys.argv[1]
                self.BASENAME = expr.split(":")[0] # use accession number as base name
                if self.BASENAME.startswith("@") :
                    self.BASENAME=self.BASENAME[1:]
                self.ANOFILE = self.BASENAME + ".ano"
                self.SEQFILE = self.BASENAME + ".seq"
                self.INDFILE = self.BASENAME + ".ind"     
                self.IDFILE = self.PREFIX+".ID"
                #Make sure the expression begins with '@'
                if not expr[0] == "@" :
                    expr = "@" + expr
                self.EXPRESSIONS = [expr] 
                WriteIdFile(self.IDFILE,self.EXPRESSIONS)
                # create a dummy infile for getob, with no features
                self.FEATURES = []
                self.FEAFILE = self.PREFIX+".inf"
                self.TEMPFILES.append(self.FEAFILE)
                self.TEMPFILES.append(self.IDFILE)
                WriteFeaFile(self.FEAFILE,self.FEATURES)
                
        # . . . . . . . . . . . . . .MULTIPLE ARGUMENT FORM . . . . . 
        #Feature keys, entries, expressions, and databases are
        #specified on the command line. Message, sequence and expression files are written.
        # Note: -g is a dummy option that is included here just to allow the user to specify 
        # that entries should be retrieved from NCBI, based on -n, -N, -a or -A. If neither
        # -u nor -U are set, then -g is assumed.
        else :
            parser = argparse.ArgumentParser()
            parser.add_argument("-f", dest="fea", action="store", default="", help="single feature")
            parser.add_argument("-F", dest="feafile", action="store", default="", help="read a file of features to search for")
            parser.add_argument("-n", dest="name", action="store", default="", help="LOCUS name")
            parser.add_argument("-N", dest="namefile", action="store", default="", help="read a file of LOCUS names")
            parser.add_argument("-a", dest="acc", action="store", default="",  help="ACCESSION number")
            parser.add_argument("-A", dest="accfile", action="store", default="", help="read a file of ACCESSION numbers")
            parser.add_argument("-e", dest="expr", action="store", default="", help="single FEATURE expression to be resolved")
            parser.add_argument("-E", dest="exprfile", action="store", default="", help="read a file of FEATURE expressions to be resolved")
            parser.add_argument("-g", dest="genbank", action="store_true", default=True, help="retrieve entries from NCBI GenBank")
            parser.add_argument("-u", dest="dbsubset", action="store", default="", help="process a subset of EXPRESSIONS from the database")
            parser.add_argument("-U", dest="dball", action="store", default="", help="process all EXPRESSIONS from the database")
            parser.add_argument("-o", dest="outname", action="store", default="", help="prefix for output files")
            parser.add_argument("-s", dest="separate", action="store_true", default=False, help="True: store each output object in a separate file")

            args = parser.parse_args()

            # Features read from command line or from a file
            if not args.fea == "" :
                 self.FEATURES = [args.fea]
                 self.RESOLVE = False
                 self.FEAFILE = self.PREFIX + ".inf"
                 WriteFeaFile(self.FEAFILE,self.FEATURES)
                 self.TEMPFILES.append(self.FEAFILE)
            elif not args.feafile == "" :
                 self.FEAFILE = args.feafile
                 self.FEATURES = Contents(args.feafile)
                 self.RESOLVE = False
                 self.FEAFILE = self.PREFIX + ".inf"
                 WriteFeaFile(self.FEAFILE,self.FEATURES) 
                 self.TEMPFILES.append(self.FEAFILE)                    

            # Read a list of names, accession numbers, or feature expressions
            # from command line or from a file
            if not args.name == "" :
                self.ENTRIES = [args.name]
                self.BASENAME = args.name # use LOCUS as base name
                self.IDFILE = self.PREFIX+".ID"
                self.TEMPFILES.append(self.IDFILE) 
                WriteIdFile(self.IDFILE,self.ENTRIES)
                self.ACCESSION = False  
                self.TEMPFILES.append(self.IDFILE)
            elif not args.acc == "" :
                self.ENTRIES = [args.acc]
                self.BASENAME = args.acc # use ACCESSION as base name
                self.IDFILE = self.PREFIX+".ID"
                self.TEMPFILES.append(self.IDFILE) 
                WriteIdFile(self.IDFILE,self.ENTRIES)
                self.ACCESSION = True
            elif not args.expr == "" :                                          
                self.RESOLVE = True
                self.ACCESSION = True
                self.NEEDTOSPLIT = True
                expr = args.expr
                self.BASENAME = expr.split(":")[0] # use accession number as base name
                if self.BASENAME.startswith("@") :
                    self.BASENAME=self.BASENAME[1:]
                #Make sure the expression begins with '@'
                if not expr[0] == "@" :
                    expr = "@" + expr 
                self.EXPRESSIONS = [expr]
                self.IDFILE = self.PREFIX+".ID"
                WriteIdFile(self.IDFILE,[expr])
                # create a dummy infile for getob, with no features
                self.FEATURES = []
                self.FEAFILE = self.PREFIX+".inf"
                self.TEMPFILES.append(self.FEAFILE)
                self.TEMPFILES.append(self.IDFILE)
                WriteFeaFile(self.FEAFILE,self.FEATURES)                                       
            elif not args.namefile == "" :
                 p = PurePath(args.namefile)
                 self.BASENAME = p.stem 
                 self.IDFILE = self.PREFIX+".ID"
                 shutil.copyfile(args.namefile,self.IDFILE)
                 self.ACCESSION = False
            elif not args.accfile == "" :
                 p = PurePath(args.accfile)
                 self.BASENAME = p.stem
                 self.IDFILE = self.BASENAME+".ID" 
                 shutil.copyfile(args.accfile,self.IDFILE)
                 self.TEMPFILES.append(self.IDFILE)
                 self.ACCESSION = True
            elif not args.exprfile == "" :                                          
                self.RESOLVE = True
                self.ACCESSION = True
                self.EXPRFILE = args.exprfile
                self.EXPRESSIONS = Contents(self.EXPRFILE)
                p = PurePath(self.EXPRFILE)
                self.BASENAME = p.stem 
                self.IDFILE = self.BASENAME+".ID"
                shutil.copyfile(args.exprfile,self.IDFILE)
                self.TEMPFILES.append(self.IDFILE)                                    
                # create a dummy infile for getob, with no features
                self.FEATURES = []
                self.FEAFILE = self.PREFIX+".inf"
                self.TEMPFILES.append(self.FEAFILE)
                WriteFeaFile(self.FEAFILE,self.FEATURES)  

            # Database or database subset
            if not args.dbsubset == "" :  
                self.DBTYPE = "u" # read a subset of entries, specified in IDFILE
                p = PurePath(args.dbsubset)
                self.BASENAME = p.stem
                extension = p.suffix
                if extension == ".gen" :
                    self.NEEDTOSPLIT = True
                else:
                    self.NEEDTOSPLIT = False        
            elif not args.dball == "" :
                self.DBTYPE = "U" # read all entries in the database
                p = PurePath(args.dball)
                self.BASENAME = p.stem
                extension = p.suffix
                if extension == ".gen" :
                    self.NEEDTOSPLIT = True
                else:
                    self.NEEDTOSPLIT = False  
                self.IDFILE = self.BASENAME+".ID"
            else :
                self.DBTYPE = "g" # retrieve entries from GenBank 
                self.NEEDTOSPLIT = True         

            if args.outname == "" :
                self.OUTNAME = self.BASENAME
            else :
                self.OUTNAME = args.outname
            self.ANOFILE = self.BASENAME + ".ano"
            self.SEQFILE = self.BASENAME + ".seq"
            self.INDFILE = self.BASENAME + ".ind"
            self.SEPARATE = args.separate

            # Make sure all necessary items are supplied on the command line
            if not self.RESOLVE and self.FEATURES == [] :
                print('features.py: >>>> Features must be specified.')
                quit()
            if self.DBTYPE == "u" :
                if Empty(self.IDFILE) :
                    print('features.py: >>>> Entries must be specified.')
                    quit()                                 

        if DEBUG and not self.QUIET :
            print("PREFIX: " + self.PREFIX)
            print("BASENAME: " + self.BASENAME)
            print("ACCESSION: " + str(self.ACCESSION))
            print("RESOLVE: " + str(self.RESOLVE))
            print("FEATURES: " + str(self.FEATURES))
            print("FEAFILE: " + self.FEAFILE)
            print("IDFILE: " + self.IDFILE)
            print("EXPRESSIONS: " + str(self.EXPRESSIONS))
            print("DBTYPE: " + self.DBTYPE)
            print("NEEDTOSPLIT: " + str(self.NEEDTOSPLIT))
            print("ANOFILE: " + self.ANOFILE)
            print("SEQFILE: " + self.SEQFILE)
            print("INDFILE: " + self.INDFILE) 
            print("OUTNAME: " + self.OUTNAME)
            print("SEPARATE: " + str(self.SEPARATE))           
            print("TEMPFILES: " + str(self.TEMPFILES))            


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# remove temporary files
def Cleanup(TEMPFILES) :
    for f in TEMPFILES :
        if os.path.isfile(f) :
            os.remove(f)
            
#======================== MAIN PROCEDURE ==========================
def main():
    """
        Called when not in documentation mode.
        """
    P = Parameters()

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
    # The first pass will usually resolve all feature expressions. 
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -     
    
    # Retrieve GenBank entries, if necessary
    if P.DBTYPE == "g" :
        if P.RESOLVE :
            #Unresolved expressions will begin with '@' in first column.
            #Find lines with unresolved expressions and extract the
            #accession numbers for use by fetch.
            fetchfile = "UNRESOLVED." + PID + ".acc"
            P.TEMPFILES.append(fetchfile)
            ff = open(fetchfile,"w")
            for line in Contents(P.IDFILE) :
                if line.startswith("@") :
                    # extract accession number, but leave off the version ie. after "."
                    out = line[1:].split(":")[0].split(".")[0] + NL
                    ff.write(out)
            ff.close()
        else:
            fetchfile = P.IDFILE
        if P.QUIET:
            p1 = subprocess.Popen(["seqfetch", "--infile", fetchfile, "--outfile", P.BASENAME+".gen"], stdout=subprocess.DEVNULL)
        else :
            p1 = subprocess.Popen(["seqfetch", "--infile", fetchfile, "--outfile", P.BASENAME+".gen"])  
        p1.wait()
        p2 = subprocess.Popen(["splitdb", P.BASENAME+".gen", P.ANOFILE, P.SEQFILE, P.INDFILE])
        p2.wait()
    
    #Extract features from the entries 
    if P.NEEDTOSPLIT :
        p3 = subprocess.Popen(["splitdb", P.BASENAME+".gen", P.ANOFILE, P.SEQFILE, P.INDFILE])
        p3.wait() 
    if P.DBTYPE == "U" :
        IndexToAccFile(P.INDFILE,P.IDFILE)
    if not Empty(P.IDFILE) and not Empty(P.ANOFILE) :
        if P.SEPARATE :
            P.GETOBOPTIONS += " -f"
            p6 = subprocess.Popen(["getob", P.GETOBOPTIONS, P.FEAFILE, P.IDFILE, P.ANOFILE, P.SEQFILE, P.INDFILE,
                    P.OUTNAME+".msg", P.OUTNAME+".out", P.OUTNAME+".exp"]) 
            p6.wait()             
        else:
            if len(sys.argv) == 2 :
               # p4 = subprocess.Popen(["getob", P.GETOBOPTIONS, P.FEAFILE, P.IDFILE, P.ANOFILE, P.SEQFILE, P.INDFILE,
               #     "/dev/null", "/dev/tty", "/dev/null"], stdout=subprocess.PIPE)
               # p4.wait()
               # p4 = subprocess.Popen(["getob", P.GETOBOPTIONS, P.FEAFILE, P.IDFILE, P.ANOFILE, P.SEQFILE, P.INDFILE,
               #     "/dev/null", "/dev/tty", "/dev/null"], stdout=subprocess.PIPE)
               # output = p4.communicate()
                p4 = subprocess.Popen(["getob", P.GETOBOPTIONS, P.FEAFILE, P.IDFILE, P.ANOFILE, P.SEQFILE, P.INDFILE,
                    "/dev/null", P.OUTNAME+".out", "/dev/null"])
                p4.wait()
                for line in Contents(P.OUTNAME+".out") :
                    print(line)
            else:
                p5 = subprocess.Popen(["getob", P.GETOBOPTIONS, P.FEAFILE, P.IDFILE, P.ANOFILE, P.SEQFILE, P.INDFILE,
                    P.OUTNAME+".msg", P.OUTNAME+".out", P.OUTNAME+".exp"]) 
                p5.wait()                                
    # Clean up a bit. Database files could be huge!
    if P.DBTYPE == "g" or P.NEEDTOSPLIT :
        os.remove(P.ANOFILE)
        os.remove(P.SEQFILE)
        os.remove(P.INDFILE)

    #Pull out all lines containing indirect references
    ENTRIES = EntriesNeeded(P.BASENAME+".out")
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
    # This part resolves any unresolved expressions that can be resolved.
    # Only in very rare cases will this loop execute more than once.
    TMPOPTIONS = "-c -r"
    TMPPREFIX = P.PREFIX+".tmp"
    TMPIDFN = TMPPREFIX+".acc"
    TMPGENBANKFN = TMPPREFIX+".gen"
    TMPANOFN = TMPPREFIX+".ano"
    TMPSEQFN = TMPPREFIX+".seq"
    TMPINDFN = TMPPREFIX+".ind" 
    TMPMSGFN = TMPPREFIX+".msg"   
    TMPEXPFN = TMPPREFIX+".exp"      

    # Create dummy Feature file for getob
    TMPFEAFN = TMPPREFIX+".feafile" 
    WriteFeaFile(TMPFEAFN,[])
    
    P.TEMPFILES += [TMPIDFN,TMPGENBANKFN,TMPANOFN,TMPSEQFN,TMPINDFN,TMPMSGFN,TMPEXPFN,TMPFEAFN]

    while len(ENTRIES) > 0 : 

        WriteIdFile(TMPIDFN,ENTRIES)
        p1 = subprocess.Popen(["seqfetch", "--infile", TMPIDFN, "--outfile", TMPGENBANKFN], stdout=subprocess.DEVNULL)
        p1.wait() 
        p3 = subprocess.Popen(["splitdb", TMPGENBANKFN, TMPANOFN, TMPSEQFN, TMPINDFN])
        p3.wait() 
                          
        p5 = subprocess.Popen(["getob", TMPOPTIONS, TMPFEAFN, P.OUTNAME+".out", TMPANOFN, TMPSEQFN, TMPINDFN,
            TMPPREFIX+".msg", TMPPREFIX+".out", TMPPRREFIX+".exp"]) 
        p5.wait() 

        # Replace previous .out file with the new .out file
        # Append contents of .msg and .exp files to previous files
        shutil.move(TMPPREFIX+".out", P.OUTNAME+".out")
        with open(P.OUTNAME+".msg", "a") as msgfile :
            with open(TMPMSGFN,"r") as tmpmsgfile : 
                for line in tmpmsgfile :
                    msgfile.write(line)
        with open(P.OUTNAME+".exp", "a") as expfile :
            with open(TMPEXPFN,"r") as tmpexpfile : 
                for line in tmpexpfile :
                    expfile.write(line)                    

        ENTRIES = EntriesNeeded(P.OUTNAME+".out")

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -     
    # Clean up    
    Cleanup(P.TEMPFILES)

if '-test' in sys.argv:
    pass
else:
    main()