#!/usr/bin/env python
# UDS - Update a database subset - Version 8/26/95
#
# Usage: uds olddatabase newdatabase
#
# The union of olddatabase and newdatabase is written to $NEW.$DBEXT.UPDATED
# Entries present in olddatabase are replaced by their equivalents from
# newdatabase.
#
# NOTE(review): the file roots of both arguments are reused as names for
# intermediate files inside the temporary working directory, so arguments
# that carry a directory component are presumably unsupported - confirm.

import os
import os.path
import re
import shutil
import subprocess
import sys

import birchscript

# Temporary working directory, created in (and removed from) the start dir.
WORKDIR = "UDS.tmp"

# Database-type option passed to splitdb/getloc, keyed by file extension.
DBFLAGS = {".gen": "-g", ".pir": "-p"}

# Markers of the annotation lines that list accession numbers.
ACCESSION_TAGS = ("ACCESSION", "#Accession")

# Captures the entry name that follows the LOCUS keyword.
LOCUS_RE = re.compile(r"LOCUS\s+(\S+)")


def _read_accessions(index_path):
    """Return the sorted, unique accession numbers listed in an index file.

    Equivalent of: tr -s ' ' ' ' < file | cut -f2 -d" " | sort | uniq
    The accession number is the second whitespace-separated field of each
    line; lines with fewer than two fields are skipped.
    """
    accessions = set()
    with open(index_path, "r") as handle:
        for line in handle:
            fields = line.split()
            if len(fields) >= 2:
                accessions.add(fields[1])
    return sorted(accessions)


def _write_lines(path, items):
    """Write each item of *items* to *path*, one per line."""
    with open(path, "w") as handle:
        handle.writelines(item + "\n" for item in items)


def _entry_files(dbext):
    """Return, sorted by name, the regular files in the current directory
    whose (case-insensitive) extension is *dbext*."""
    return sorted(
        name for name in os.listdir(os.getcwd())
        if os.path.isfile(name)
        and os.path.splitext(name)[1].lower() == dbext
    )


def _locus_name(path):
    """Return the name on the first LOCUS line of *path*, or None.

    None is returned when the file has no LOCUS line or the line carries
    no name, so the caller never reuses a name from a previous file.
    """
    with open(path, "r") as handle:
        for line in handle:
            if line.startswith("LOCUS"):
                match = LOCUS_RE.match(line)
                return match.group(1) if match else None
    return None


def _find_unique_accessions(old, new):
    """Return accession numbers of *old* that are absent from *new*.

    First, lists of accession numbers from the old and new indexes are
    compared, and any accession numbers unique to the old index are
    written to notfound.tmp. Next, each of those is searched for among the
    ACCESSION or #Accession lines of the new database subset. If it is
    found there, the old entry is assumed to have been merged into another
    entry. Accession numbers still not found are returned.
    """
    old_acc = _read_accessions(old + ".ind")
    new_acc = _read_accessions(new + ".ind")
    _write_lines(old + ".acc", old_acc)
    _write_lines(new + ".acc", new_acc)

    # fgrep -v -f $NEW.acc $OLD.acc > notfound.tmp
    new_acc_set = set(new_acc)
    notfound = [acc for acc in old_acc if acc not in new_acc_set]
    _write_lines("notfound.tmp", notfound)

    # NOTE(review): $NEW.acl was read but never created; it is built here
    # from the annotation file written by splitdb - confirm that .ano is
    # the intended source of the accession lines.
    with open(new + ".ano", "r") as handle:
        accession_lines = [
            line for line in handle
            if any(tag in line for tag in ACCESSION_TAGS)
        ]
    with open(new + ".acl", "w") as handle:
        handle.writelines(accession_lines)

    # Keep only the accession numbers that appear on no accession line.
    return [
        acc for acc in notfound
        if not any(acc in line for line in accession_lines)
    ]


def _update(old_db, new_db, old, new, dbext, startdir):
    """Build the updated database; must run inside the working directory.

    old_db/new_db are the database names as given on the command line,
    old/new their roots without extension, dbext the lower-cased extension
    (with leading dot) and startdir the directory the script started in.
    """
    dbflag = DBFLAGS[dbext]

    # Split both databases into annotation, sequence and index files.
    subprocess.call(["splitdb", dbflag, os.path.join(startdir, old_db),
                     old + ".ano", old + ".wrp", old + ".ind"])
    subprocess.call(["splitdb", dbflag, os.path.join(startdir, new_db),
                     new + ".ano", new + ".wrp", new + ".ind"])

    unique = _find_unique_accessions(old, new)
    if not unique:
        print("All accession numbers in " + old_db
              + " are also found in " + new_db)
        print("No action taken.")
        return

    # getloc reads the names to extract from this file.
    _write_lines(old + ".unique", unique)

    # Extract these unique entries as individual files in UDS.tmp
    subprocess.call(["getloc", dbflag, "-c", "-f", old + ".unique",
                     old + ".ano", old + ".wrp", old + ".ind"])

    # Extract all of the entries from $NEW as individual files in UDS.tmp
    subprocess.call(["getloc", dbflag, "-f", new + ".ind",
                     new + ".ano", new + ".wrp", new + ".ind"])

    # Rename files so that their names are LOCUS names, not ACCESSION
    # numbers. This will ensure that files get written in alphabetical
    # order by LOCUS name. Files without a LOCUS line keep their name.
    for name in _entry_files(dbext):
        lname = _locus_name(name)
        if lname and name != lname + dbext:
            shutil.move(name, lname + dbext)

    # Write all of the entries collected in UDS.tmp into one file in
    # the original working directory
    updated_db_name = new + dbext + ".UPDATED"
    updated_db_path = os.path.join(startdir, updated_db_name)
    print("Writing updated database to " + updated_db_name)
    if os.path.exists(updated_db_path):
        os.remove(updated_db_path)
    with open(updated_db_path, "w") as h_newdbupdated:
        # The directory is listed again because the renaming above may
        # have changed the file names.
        for name in _entry_files(dbext):
            birchscript.cat_to(name, h_newdbupdated)


def main(argv=None):
    """Run uds with *argv* (defaults to sys.argv)."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print("Usage: uds olddatabase newdatabase")
        sys.exit()

    old, old_ext = os.path.splitext(argv[1])
    new, new_ext = os.path.splitext(argv[2])
    dbext = new_ext.lower()

    # Make sure databases are of the same type
    if dbext != old_ext.lower():
        print(">>> Databases not of same type")
        sys.exit()

    if dbext not in DBFLAGS:
        print(">>> Unknown database file extension " + dbext)
        return

    # carry out operations in a temporary working directory
    startdir = os.getcwd()
    os.mkdir(WORKDIR)
    try:
        os.chdir(WORKDIR)
        _update(argv[1], argv[2], old, new, dbext, startdir)
    finally:
        # Cleanup - remove temporary directory, even after a failure
        os.chdir(startdir)
        shutil.rmtree(WORKDIR)


if __name__ == "__main__":
    main()