#!/usr/bin/env python

import birchscript
import re
import os
import os.path
import shutil
import subprocess
import sys

# UDS - Update a database subset - Version 8/26/95

# The union of olddatabase and newdatabase is written to $NEW.$DBEXT.UPDATED
# Entries present in olddatabase are replaced by their equivalents from
# newdatabase.      

# Per database type (keyed by lower-cased file extension): the option that
# selects that format in splitdb/getloc, and the keyword marking accession
# lines in the annotation (.ano) file.
_DB_TYPES = {
    ".gen": ("-g", "ACCESSION"),
    ".pir": ("-p", "#Accession"),
}

# Name of the scratch directory created in the starting directory.
_TMPDIR = "UDS.tmp"


def _read_accessions(index_path):
    """Return the sorted, de-duplicated second column of an index file.

    Equivalent of: tr -s ' ' ' ' < file.ind | cut -f2 -d" " | sort | uniq
    Lines with fewer than two whitespace-separated fields are skipped.
    """
    accessions = set()
    with open(index_path, "r") as handle:
        for line in handle:
            fields = line.split()
            if len(fields) > 1:
                accessions.add(fields[1])
    return sorted(accessions)


def _accession_lines(annotation_path, keyword):
    """Return the lines of an annotation file that contain `keyword`.

    Equivalent of: egrep -e KEYWORD file.ano > file.acl
    """
    with open(annotation_path, "r") as handle:
        return [line.rstrip("\r\n") for line in handle if keyword in line]


def _find_unmerged(old_accessions, new_accessions, accession_lines):
    """Return the old accession numbers that are truly absent from the new set.

    An old accession number is dropped when it is present in the new index,
    or when it appears anywhere on an accession line of the new subset (in
    which case the old entry is assumed to have been merged into another
    entry). The order of `old_accessions` is preserved.
    """
    known = set(new_accessions)
    # One newline-joined blob: accession numbers never contain a newline, so
    # a substring hit can only come from within a single accession line.
    merged_text = "\n".join(accession_lines)
    return [acno for acno in old_accessions
            if acno not in known and acno not in merged_text]


def _locus_name(entry_path):
    """Return the name on the first LOCUS line of a file, or None if absent."""
    with open(entry_path, "r") as handle:
        for line in handle:
            if line.startswith("LOCUS"):
                fields = line.split()
                return fields[1] if len(fields) > 1 else None
    return None


def _entry_files(dbext):
    """Return, sorted by name, the regular files in the current directory
    whose lower-cased extension equals `dbext`."""
    return sorted(name for name in os.listdir(os.getcwd())
                  if os.path.isfile(name)
                  and os.path.splitext(name)[1].lower() == dbext)


def main(argv):
    """Update a database subset.

    argv -- command line: [program, olddatabase, newdatabase]

    The union of olddatabase and newdatabase is written to
    <newdatabase root><ext>.UPDATED, relative to the starting directory.
    All intermediate files live in a UDS.tmp directory that is removed
    again even if a step fails. Usage and type errors are reported on
    stdout; the function then simply returns, as the script always did.
    """
    if len(argv) < 3:
        print("Usage: uds olddatabase newdatabase")
        return

    old_arg, new_arg = argv[1], argv[2]
    new_root = os.path.splitext(new_arg)[0]
    dbext = os.path.splitext(new_arg)[1].lower()

    # Make sure databases are of the same type
    if dbext != os.path.splitext(old_arg)[1].lower():
        print(">>> Databases not of same type")
        return

    if dbext not in _DB_TYPES:
        print(">>> Unknown database file extension " + dbext)
        return
    # set database option for use with splitdb/getloc
    dbflag, keyword = _DB_TYPES[dbext]

    # Working files are created inside the temporary directory, so their
    # names must not carry directory components from the arguments.
    old_work = os.path.basename(os.path.splitext(old_arg)[0])
    new_work = os.path.basename(new_root)
    if new_work == old_work:
        # same file name in two different directories: keep the sets apart
        new_work += ".new"

    # carry out operations in a temporary working directory
    startdir = os.getcwd()
    # Created outside the try block: if mkdir fails because the directory
    # already exists, we must not delete a directory we did not create.
    os.mkdir(_TMPDIR)
    try:
        os.chdir(_TMPDIR)
        for source, work in ((old_arg, old_work), (new_arg, new_work)):
            subprocess.call(["splitdb", dbflag,
                             os.path.join(startdir, source),
                             work + ".ano", work + ".wrp", work + ".ind"])

        # Find accession numbers that are in the old database subset but
        # not in the new. Accession numbers unique to the old index are
        # then searched for among the ACCESSION or #Accession lines of the
        # new subset; a hit means the old entry was merged into another
        # entry. Whatever is still not found is unique to the old subset.
        unique = _find_unmerged(
            _read_accessions(old_work + ".ind"),
            _read_accessions(new_work + ".ind"),
            _accession_lines(new_work + ".ano", keyword))

        if unique:
            with open(old_work + ".unique", "w") as handle:
                handle.writelines(acno + "\n" for acno in unique)

            # Extract these unique entries as individual files in UDS.tmp
            subprocess.call(["getloc", dbflag, "-c", "-f",
                             old_work + ".unique", old_work + ".ano",
                             old_work + ".wrp", old_work + ".ind"])

            # Extract all of the entries from the new subset as individual
            # files in UDS.tmp
            subprocess.call(["getloc", dbflag, "-f",
                             new_work + ".ind", new_work + ".ano",
                             new_work + ".wrp", new_work + ".ind"])

            # Rename files so that their names are LOCUS names, not
            # ACCESSION numbers. This will ensure that files get written in
            # alphabetical order by LOCUS name.
            for entry in _entry_files(dbext):
                locus = _locus_name(entry)
                # files without a LOCUS line keep their current name
                if locus is not None and entry != locus + dbext:
                    shutil.move(entry, locus + dbext)

            # Write all of the entries collected in UDS.tmp into one file in
            # the original working directory
            updated_db_name = new_root + dbext + ".UPDATED"
            updated_db_path = os.path.join(startdir, updated_db_name)

            print("Writing updated database to " + updated_db_name)

            if os.path.exists(updated_db_path):
                os.remove(updated_db_path)
            with open(updated_db_path, "w") as h_updated:
                # the directory is listed again because the renaming above
                # changed some of the file names
                for entry in _entry_files(dbext):
                    birchscript.cat_to(entry, h_updated)
        else:
            print("All accession numbers in " + old_arg +
                  " are also found in " + new_arg)
            print("No action taken.")
    finally:
        # Cleanup - remove temporary directory
        os.chdir(startdir)
        shutil.rmtree(_TMPDIR)


if __name__ == "__main__":
    main(sys.argv)