#!/usr/bin/env python
"""
BLupdate_blastdb.py - Set environment variables for BioLegato Helper Applications

Synopsis:
    BLupdate_blastdb.py --configure --birchdir directory --blastdb directory
    BLupdate_blastdb.py --addfiles --ftpsite url --dblist db[,db]
    BLupdate_blastdb.py --deletefiles --dblist db[,db]
    BLupdate_blastdb.py --updatedb --ftpsite url --dblist db[,db]

@modified: April 1, 2016
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca
"""

# optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing,
# it is safer to stick with optparse for now. It should be easy
# to change later, since the syntax is very similar between argparse and optparse.
from optparse import OptionParser
import ftplib
import hashlib
import time
import datetime
import os
import os.path
import re
import stat
import subprocess
import sys
import tarfile

# birchlib is located through $BIRCHPYLIB. Only extend the module search path
# when the variable is actually set, so that None never ends up in sys.path.
blib = os.environ.get("BIRCHPYLIB")
if blib:
    sys.path.append(blib)
from birchlib import Birchmod

PROGRAM = "BLupdate_blastdb.py : "
USAGE = "\n\tUSAGE: BLupdate_blastdb.py --configure --birchdir directory --blastdb directory" + \
        "\n\t\tBLupdate_blastdb.py --reportlocal" + \
        "\n\t\tBLupdate_blastdb.py --reportftp [--ftpsite url]" + \
        "\n\t\tBLupdate_blastdb.py --addfiles --ftpsite url --dblist db[,db]" + \
        "\n\t\tBLupdate_blastdb.py --deletefiles --dblist db[,db]" + \
        "\n\t\tBLupdate_blastdb.py --updatedb --ftpsite url --dblist db[,db]"

DEBUG = True
if DEBUG:
    print('Debugging mode on')

BM = Birchmod(PROGRAM, USAGE)

# Names of the settings stored in BIRCH.settings and exported to the shell.
BIRCHvariables = ['BIRCH_PROMPT', 'BLASTDB']

# FTPINFO is a dictionary whose keys are ftp site addresses, and whose values
# are dictionaries of field names and their values:
#     dbdir   - directory in which the BLAST-formatted databases are found.
#     UseMLSD - True or False. In order to use MLSD with FTP, the site must
#               return the filename, size, and modification time.
# At this writing, only ftp.ncbi.nih.gov supports MLSD and returns all three
# values. ftp.ebi.ac.uk does not support MLSD. ftp.hgc.jp supports MLSD, but
# does not return the size of a file.
# Future: Possible additional fields might include timezone or geographical
# location of an FTP site.
FTPINFO = {"ftp.ncbi.nih.gov": {"dbdir": "/blast/db", "UseMLSD": True},
           "ftp.hgc.jp": {"dbdir": "pub/mirror/ncbi/blast/db", "UseMLSD": False},
           "ftp.ebi.ac.uk": {"dbdir": "pub/blast/db", "UseMLSD": False}}
# Earlier flat form of the table, kept for reference:
#FTPINFO = {"ftp.ncbi.nih.gov" : "/blast/db" , "ftp.hgc.jp" : "pub/mirror/ncbi/blast/db", "mirrors.vbi.vt.edu" : "mirrors/ftp.ncbi.nih.gov/blast/db"}
# For convenience, we create two lists of database names from local_dblist.
# This will be used for other methods that do things like checking for
# the presence of databases etc. Importantly, DBNAMES_ALL has the database names
# in the order in which they are read, which is not true in Python dictionaries.
# When database lists are stored in dictionaries, we can access their components
# in the original order by referencing DBNAMES_ALL. DBNAMES_INSTALLED lists those
# databases that are currently installed.
DBNAMES_ALL = []
DBNAMES_INSTALLED = []


# - - - - - - - - - - - - - Utility functions - - - - - - - - - - - - - - - - -
def chmod_ar(filename):
    """
    Make a file world-readable.

    Adds read permission for user, group and others to the existing mode.
    Does nothing when the file cannot be stat'ed (e.g. it does not exist).
    """
    try:
        mode = os.stat(filename).st_mode
    except OSError:
        # Same outcome as the former os.path.exists() test, without the
        # window between the existence check and the stat call.
        return
    os.chmod(filename, mode | stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)


def chmod_arx(filename):
    """
    Make a file or directory world-readable and world-executable/searchable.

    Adds read and execute permission for user, group and others to the
    existing mode. Does nothing when the path cannot be stat'ed.
    """
    try:
        mode = os.stat(filename).st_mode
    except OSError:
        return
    os.chmod(filename, mode | stat.S_IEXEC | stat.S_IREAD
             | stat.S_IXGRP | stat.S_IRGRP
             | stat.S_IXOTH | stat.S_IROTH)


def LocalHostname():
    """
    Return the name of the local machine.

    Tries a number of methods to get a name other than 'localhost' or a
    null result: $HOSTNAME, then platform.uname(), then the socket module.
    If the reverse lookup of a short host name fails, the short name itself
    is returned instead of raising.
    """
    import socket
    import platform

    def CheckName(name):
        # A usable name is non-empty and does not start with 'localhost'.
        return bool(name) and not name.startswith("localhost")

    name = os.getenv('HOSTNAME')
    if not CheckName(name):
        name = platform.uname()[1]
    if not CheckName(name):
        host = socket.gethostname()
        if '.' in host:
            name = host
        else:
            try:
                name = socket.gethostbyaddr(host)[0]
            except socket.error:
                # No resolvable address for this host; fall back to the
                # unqualified name rather than aborting the whole run.
                name = host
    return name
def GetBIRCHProperties(BIRCHDIR, PropName):
    """
    Retrieve a value from BIRCH.properties.

    eg. To retrieve the value of BirchProps.adminEmail:
        GetBIRCHProperties(BIRCHDIR,"adminEmail")

    BIRCHDIR - BIRCH installation directory; the file read is
               BIRCHDIR/local/admin/BIRCH.properties
    PropName - property name without the 'BirchProps.' prefix

    Returns the first non-empty value found for the property, or "" when the
    property is absent or empty. Everything after the first '=' is the value,
    so values may themselves contain '='.
    """
    PFN = os.path.join(BIRCHDIR, 'local', 'admin', 'BIRCH.properties')
    Target = 'BirchProps.' + PropName
    with open(PFN, 'r') as pfile:
        for line in pfile:
            # ignore comment lines
            if line.startswith('#'):
                continue
            key, sep, value = line.partition('=')
            # Lines without '=' (including blank lines) carry no value.
            if sep and key.strip() == Target:
                value = value.strip()
                if value != "":
                    return value
    return ""


def DeleteFilesByPrefix(dest, dbname, LOGFILE):
    """
    Delete all files from dest whose names begin with dbname.

    A file matches when the part of its name before the first '.' equals
    dbname (so 'nt' matches nt.00.nsq but not ntx.1). Each deletion is
    recorded in LOGFILE. As a side effect, the current working directory
    is changed to dest.
    """
    Indent4 = ' ' * 4
    os.chdir(dest)
    for filename in sorted(os.listdir(dest)):
        prefix = filename.split('.')[0]
        if prefix == dbname:
            os.remove(os.path.join(dest, filename))
            LOGFILE.write(Indent4 + filename + ' deleted\n')


def SendEmail(From, To, Subject, Text):
    """
    Very simple email method adapted from:
    http://stackoverflow.com/questions/882712/sending-html-email-using-python
    There are more elaborate examples on this site for sending HTML
    messages and attachments.

    From    - sender address
    To      - recipient address, or a list of recipient addresses
    Subject - subject line
    Text    - message body; sent as both a plain-text and an html part

    The message is handed to the SMTP server on localhost. Failure to send
    is reported on stdout and is not raised to the caller.
    """
    import smtplib
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    Host = 'localhost'
    msg = MIMEMultipart('alternative')
    msg['Subject'] = Subject
    msg['From'] = From
    if isinstance(To, (list, tuple)):
        msg['To'] = ', '.join(To)
    else:
        msg['To'] = To

    # NOTE(review): this template contains only newlines around the text;
    # it looks like HTML markup was lost from it at some point - confirm
    # against the upstream copy before relying on the html part.
    Html = "\n%s\n\n" % (Text)
    part1 = MIMEText(Text, 'plain')
    part2 = MIMEText(Html, 'html')
    msg.attach(part1)
    msg.attach(part2)
    try:
        server = smtplib.SMTP(Host)
        server.sendmail(From, To, msg.as_string())
        server.quit()
        print("Successfully sent email")
    except (smtplib.SMTPException, OSError):
        # Notification is best effort: a missing or refusing local mail
        # server must not abort a database update that already finished.
        print("Error: unable to send email")
def getValue(tokens, kw):
    """
    Return a value for a key/value pair, where = is the separator.

    tokens - list of strings of the form 'key=value'
    kw     - key to look for

    Returns the value of the first token whose key equals kw, or "" when no
    token matches or the matching token has no '=' part.
    """
    for token in tokens:
        key, sep, value = token.partition('=')
        if key == kw:
            return value
    return ""


def download_file(FTPSITE, FTPDIR, FN):
    """
    Download a file from an FTP site.

    Logs in anonymously to FTPSITE, changes to FTPDIR and writes the remote
    file FN to a file of the same name in the current working directory.
    FTP and I/O errors propagate to the caller; the connection and the
    output file are closed in either case.
    """
    ftp = ftplib.FTP(FTPSITE)
    try:
        ftp.login()
        ftp.cwd(FTPDIR)
        with open(FN, 'wb') as Outfile:
            ftp.retrbinary('RETR ' + FN, Outfile.write)
    finally:
        try:
            ftp.quit()
        except ftplib.all_errors:
            # The polite QUIT can fail on an already broken connection;
            # just drop the socket so the original error is what surfaces.
            ftp.close()


def md5Checksum(filePath):
    """
    Calculate an md5 checksum for a file.

    Writen by Joel Verhagen
    http://joelverhagen.com/blog/2011/02/md5-hash-of-file-in-python/

    The file is read in 8192-byte chunks. Returns the hex digest string.
    """
    m = hashlib.md5()
    with open(filePath, 'rb') as fh:
        for data in iter(lambda: fh.read(8192), b''):
            m.update(data)
    return m.hexdigest()


def extracttgz(fname):
    """
    Extract all files from a .tar.gz file into the current directory.

    Adapted from:
    https://sukhbinder.wordpress.com/2014/03/06/untar-a-tar-file-with-python/

    Files whose names do not end in 'tar.gz' are ignored.
    """
    if fname.endswith("tar.gz"):
        tar = tarfile.open(fname)
        try:
            # SECURITY: the archive comes from a remote site. Where this
            # Python provides extraction filters, use the 'data' filter so
            # members cannot be written outside the destination directory.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(filter='data')
            else:
                tar.extractall()
        finally:
            tar.close()


def InstallFile(tgzfile, FTPSITE, FTPDIR, LOGFILE):
    """
    Download a BLAST .tar.gz and its corresponding .md5 file from the FTP
    site, and untar the file in in the current directory. Delete the
    .tar.gz file when done, but keep the .md5 file.

    tgzfile - name of the .tar.gz file on the FTP site
    FTPSITE - FTP host name
    FTPDIR  - directory on the FTP site holding the file
    LOGFILE - open, writable log file

    The download is attempted up to three times until the md5 checksum of
    the downloaded archive matches the first field of the .md5 file.
    Returns True when the archive was verified and extracted, else False.
    """
    MAX_DOWNLOAD_ATTEMPTS = 3
    Indent4 = ' ' * 4
    md5file = tgzfile + '.md5'
    SUCCESS = False
    ATTEMPTS = 0

    while not SUCCESS and (ATTEMPTS < MAX_DOWNLOAD_ATTEMPTS):
        # Download .tar.gz file and .tar.gz.md5 file
        for fn in (tgzfile, md5file):
            try:
                download_file(FTPSITE, FTPDIR, fn)
            except ftplib.all_errors:
                LOGFILE.write(Indent4 + 'Failed to download ' + fn + '\n')

        # Calculate md5 checksum, and compare it with the checksum file
        if os.path.exists(tgzfile) and os.path.exists(md5file):
            LocalChecksum = md5Checksum(tgzfile)
            # Read as text: the digest is compared with a str hex digest.
            with open(md5file, 'r') as fh:
                fields = fh.readline().split()
            RemoteChecksum = fields[0].lower() if fields else ""
            if LocalChecksum == RemoteChecksum:
                SUCCESS = True
            else:
                LOGFILE.write(Indent4 + '>>> Checksum for ' + tgzfile +
                              ' does not match ' + md5file + '\n')
        ATTEMPTS += 1

    # Uncompress the .tar.gz file
    if SUCCESS:
        try:
            extracttgz(tgzfile)
        except Exception:
            # Any failure while unpacking (bad archive, disk full, ...)
            # marks this file as not installed; the run continues.
            SUCCESS = False
            LOGFILE.write(Indent4 + '>>> Error extracting files from ' + tgzfile + '\n')

    if SUCCESS:
        os.remove(tgzfile)
    return SUCCESS
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class FileMetaData:
    """
    Holds metadata for a file: its name, size in bytes and modification time.
    """

    def __init__(self):
        """
        Initializes attributes:
            Name = ""
            Size = 0
            CurrentYear = year at the time of creation
            Mtime = time of creation
        """
        self.Name = ""
        self.Size = 0
        now = datetime.datetime.now()
        self.CurrentYear = now.year
        self.Mtime = now

    def getLocalMetaData(self, FN):
        """
        Retrieve metadata for a local file.

        Sets Name to FN as given, and Size and Mtime from os.stat(FN).
        """
        finfo = os.stat(FN)
        self.Name = FN
        self.Size = int(finfo.st_size)
        self.Mtime = datetime.datetime.fromtimestamp(finfo.st_mtime)

    def _RecentListingTime(self, Day, Month, TimeHM):
        """
        Resolve a directory-listing date that has a time but no year.

        Directory listings for files from the previous 12 months will not
        list a year, only a modification time. The year will either be the
        current year or the previous year. Assuming the current year can
        yield a time in the future, or an invalid date (Feb 29 in a
        non-leap year); in both cases step back one year at a time until
        the date is valid and not in the future.
        """
        now = datetime.datetime.now()
        for year in range(self.CurrentYear, self.CurrentYear - 9, -1):
            TimeStr = Day + ' ' + Month + ' ' + str(year) + ' ' + TimeHM
            try:
                stamp = datetime.datetime.strptime(TimeStr, "%d %b %Y %H:%M")
            except ValueError:
                continue
            if stamp <= now:
                return stamp
        raise ValueError('cannot interpret listing date: ' +
                         Day + ' ' + Month + ' ' + TimeHM)

    def getFTPMetaData(self, line, UseMLSD):
        """
        Parse metadata for a file on a remote FTP site.

        line    - one line of output from the FTP MLSD or LIST command
        UseMLSD - True if line is MLSD output, False if it is LIST output

        Sets Name, Size and Mtime. A LIST line that does not split into
        exactly nine fields is ignored, leaving the attributes unchanged.
        """
        if UseMLSD:
            tokens = line.split(';')
            self.Name = tokens[-1].lstrip()
            # Entries may lack a fact (eg. no size for a directory); keep
            # the defaults then instead of failing on an empty string.
            timestruct = getValue(tokens, 'modify')
            if timestruct != "":
                # Ignore an optional fractional-seconds suffix.
                self.Mtime = datetime.datetime.strptime(timestruct[:14], "%Y%m%d%H%M%S")
            sizetext = getValue(tokens, 'size')
            if sizetext != "":
                self.Size = int(sizetext)
        else:
            # Parsing directory lines is hazardous! Directory listings from
            # a number of FTP sites seem to all be consistent with
            # tokenizing into nine non-blank fields:
            #     Field   Content
            #     -----------------------
            #     0       permissions
            #     1       ?
            #     2       owner
            #     3       group
            #     4       size
            #     5       Month
            #     6       Day
            #     7       Time or Year
            #     8       Name
            # Field 8, is a special case, because a name might include
            # blanks. For the purposes of downloading NCBI files, we don't
            # need to worry about blanks in names.
            #
            # When None is used as the separator, multiple separators are
            # parsed as a single separator.
            tokens = line.split(None)
            if len(tokens) == 9:
                # Name and Size are easy
                self.Name = tokens[8]
                self.Size = int(tokens[4])
                # Modification time takes a bit more work.
                Month = tokens[5]
                Day = tokens[6]
                if ':' in tokens[7]:
                    self.Mtime = self._RecentListingTime(Day, Month, tokens[7])
                else:
                    TimeStr = Day + ' ' + Month + ' ' + tokens[7] + ' ' + "00:00"
                    self.Mtime = datetime.datetime.strptime(TimeStr, "%d %b %Y %H:%M")
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters
    """

    def __init__(self):
        """
        Initializes arguments:
            CONFIGURE = False
            BIRCHDIR = ""
            BLASTDB = ""
            REPORTLOCAL = False
            REPORTFTP = False
            ADDFILES = False
            DELETEFILES = False
            UPDATEDB = False
            FTPSITE = ""
            DBLIST = []
            SFN = ""
        Then calls read_args() to fill in their values from command line.
        SFN, the path of the BIRCH.settings file, is derived from BIRCHDIR.
        """
        self.CONFIGURE = False
        self.BIRCHDIR = ""
        self.BLASTDB = ""
        self.REPORTLOCAL = False
        self.REPORTFTP = False
        self.ADDFILES = False
        self.DELETEFILES = False
        self.UPDATEDB = False
        self.FTPSITE = ""
        self.DBLIST = []
        self.read_args()
        self.SFN = os.path.join(self.BIRCHDIR, 'local', 'admin', 'BIRCH.settings')
        if DEBUG:
            print('------------ Parameters from command line ------')
            print(' CONFIGURE: ' + str(self.CONFIGURE))
            print(' BIRCHDIR: ' + self.BIRCHDIR)
            print(' BLASTDB: ' + self.BLASTDB)
            print(' REPORTLOCAL: ' + str(self.REPORTLOCAL))
            print(' REPORTFTP: ' + str(self.REPORTFTP))
            print(' ADDFILES: ' + str(self.ADDFILES))
            print(' DELETEFILES: ' + str(self.DELETEFILES))
            print(' UPDATEDB: ' + str(self.UPDATEDB))
            print(' FTPSITE: ' + self.FTPSITE)
            print(' DBLIST: ' + str(self.DBLIST))
            print(' Settings file: ' + self.SFN)
            print('')

    def read_args(self):
        """
        Read command line arguments into a Parameter object.

        Defaults applied when an option is not given:
            BIRCHDIR - $BIRCH (the program exits with a usage error when
                       neither --birchdir nor $BIRCH is available)
            BLASTDB  - $BLASTDB if set, else BIRCHDIR/GenBank
            FTPSITE  - ftp.ncbi.nih.gov
            DBLIST   - empty list
        """
        parser = OptionParser()
        parser.add_option("--configure", dest="configure", action="store_true", default=False,
                          help="in a new install or update, set BIRCHDB environment variable")
        parser.add_option("--birchdir", dest="birchdir", action="store", default="",
                          help="path to BIRCH installation directory")
        parser.add_option("--blastdb", dest="blastdb", action="store", default="",
                          help="path to Blast Database directory")
        parser.add_option("--reportlocal", dest="reportlocal", action="store_true", default=False,
                          help="Write local database stats to $BLASTDB/localstats.tsv")
        parser.add_option("--reportftp", dest="reportftp", action="store_true", default=False,
                          help="Write database stats from remote FTP site to $BLASTDB/ftpstats.tsv")
        parser.add_option("--addfiles", dest="addfiles", action="store_true", default=False,
                          help="add files to Blast database")
        parser.add_option("--deletefiles", dest="deletefiles", action="store_true", default=False,
                          help="delete files from Blast database")
        parser.add_option("--updatedb", dest="updatedb", action="store_true", default=False,
                          help="download and install updates for Blast Database")
        parser.add_option("--ftpsite", dest="ftpsite", action="store", default="",
                          help="FTP site from which to download update files")
        parser.add_option("--dblist", dest="rawdblist", action="store", default="",
                          help="list of database files to add, delete or update")
        (options, args) = parser.parse_args()

        self.CONFIGURE = options.configure

        self.BIRCHDIR = options.birchdir
        if self.BIRCHDIR == "":
            if "BIRCH" not in os.environ:
                parser.error("--birchdir was not given and $BIRCH is not set")
            self.BIRCHDIR = str(os.environ['BIRCH'])

        self.BLASTDB = options.blastdb
        if self.BLASTDB == "":
            if "BLASTDB" in os.environ:
                self.BLASTDB = str(os.environ['BLASTDB'])
            else:
                self.BLASTDB = os.path.join(self.BIRCHDIR, 'GenBank')

        self.REPORTLOCAL = options.reportlocal
        self.REPORTFTP = options.reportftp
        self.ADDFILES = options.addfiles
        self.DELETEFILES = options.deletefiles
        self.UPDATEDB = options.updatedb

        # We need to deal with the possibility that the user has prefixed the
        # URL with a protocol. The site name is used as a key into FTPINFO and
        # as the host to connect to, so it must be the bare host name.
        site = options.ftpsite.strip()
        for protocol in ('ftp://', 'http://', 'https://'):
            if site.startswith(protocol):
                site = site[len(protocol):]
                break
        site = site.rstrip('/')
        if site == "":
            site = "ftp.ncbi.nih.gov"
        self.FTPSITE = site

        # When BioLegato calls BLupdate_blastdb.py, the comma-separated list
        # in --dblist will usually contain empty elements. We need to remove
        # those empty components from dblist.
        self.DBLIST = [x for x in options.rawdblist.split(",") if x != ""]
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class BIRCHSettings:
    """
    Data and methods for the BIRCH Settings file.
    """

    def __init__(self, P):
        """
        Initializes arguments:
            dict = {}   one entry per name in BIRCHvariables, initially ""
        Values are then read from the settings file P.SFN if it exists.
        """
        self.dict = {}
        for var in BIRCHvariables:
            self.dict[var] = ""
        if os.path.exists(P.SFN):
            self.ReadBIRCHSettings(P.SFN)
        #else:
        #    DFN = os.path.join(self.BIRCH , 'admin' , 'BIRCH.settings.default')
        #    self.ReadBIRCHSettings(DFN)
        if DEBUG:
            print('- - - - - BIRCH Settings - - - - -')
            for k in self.dict:
                print(' ' + k + ',' + self.dict[k])

    def ReadBIRCHSettings(self, FN):
        """
        Read current values of BIRCHvariables from BIRCH.settings.

        Lines have the form NAME=value. Blank lines, comment lines, lines
        without '=' and names not listed in BIRCHvariables are ignored.
        Everything after the first '=' is taken as the value. A missing
        file is silently ignored.
        """
        if os.path.exists(FN):
            with open(FN, 'r') as Sfile:
                for line in Sfile:
                    line = line.strip()
                    # ignore blank lines and comment lines
                    if line == "" or line[0] == '#':
                        continue
                    name, sep, value = line.partition("=")
                    if sep and name in BIRCHvariables:
                        self.dict[name] = value

    def WriteBIRCHSettings(self, SFN):
        """
        Write current values of BIRCHvariables to the settings file SFN,
        one NAME=value line per variable, preceded by a three-line header.
        """
        with open(SFN, 'w') as Sfile:
            Sfile.write('# DO NOT EDIT THIS FILE!\n')
            Sfile.write('# This file is automatically generated by BLupdate_blastdb.py during installation,\n')
            Sfile.write('# update or by birchadmin --> Preferences --> Settings\n')
            for k in self.dict:
                Sfile.write(k + '=' + self.dict[k] + '\n')

    def WriteBIRCHenvBourne(self, P):
        """
        Write bash code for setting BIRCHvariables to
        birch_settings_Bourne.source in P.BIRCHDIR/admin, and make that
        file world-readable. Used for Bourne type shells eg. bash, sh
        """
        ENVFN = os.path.join(P.BIRCHDIR, 'admin', 'birch_settings_Bourne' + '.source')
        with open(ENVFN, 'w') as Sfile:
            Sfile.write('# DO NOT EDIT THIS FILE!\n')
            Sfile.write('# This file is automatically generated by BLupdate_blastdb.py during installation,\n')
            Sfile.write('# update or by birchadmin --> BIRCHSettings\n')
            # Enclose value of argument in single quotes. This is mainly for
            # cases such as BL_Terminal='gnome-terminal -e'
            for k in self.dict:
                Sfile.write(k + "='" + self.dict[k] + "'\n")
            Sfile.write('export ')
            for var in BIRCHvariables:
                Sfile.write(' ' + var)
            Sfile.write('\n')
        chmod_ar(ENVFN)

    def WriteBIRCHenvCsh(self, P):
        """
        Write csh code for setting BIRCHvariables to
        birch_settings_csh.source in P.BIRCHDIR/admin, and make that file
        world-readable. Used for C type shells eg. csh, tcsh
        """
        ENVFN = os.path.join(P.BIRCHDIR, 'admin', 'birch_settings_csh' + '.source')
        with open(ENVFN, 'w') as Sfile:
            Sfile.write('# DO NOT EDIT THIS FILE!\n')
            Sfile.write('# This file is automatically generated by BLupdate_blastdb.py during installation,\n')
            Sfile.write('# update or by birchadmin --> BIRCHSettings\n')
            # Quote the value, as in the Bourne version, so that values
            # containing blanks remain a single setenv argument.
            for k in self.dict:
                Sfile.write('setenv ' + k + " '" + self.dict[k] + "'\n")
        chmod_ar(ENVFN)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class BLASTDBList:
    """
    Data and methods for the BLAST databases.

    Attributes:
        default_dblist - records read from the default BLASTDB.list
        local_dblist   - records for this installation; each record is a
                         list [description, sequence type, install status]
        localDBFILES   - {dbname: {filename: FileMetaData}} for files found
                         in the BLASTDB directory
        remoteDBFILES  - {dbname: {tgzname: {tgzname: [FileMetaData, ...]}}}
                         for files found on the remote FTP site
    """

    def __init__(self, P, DBNAMES_ALL):
        """
        Initializes local BLASTDB list.

        P           - Parameters object (BIRCHDIR is used)
        DBNAMES_ALL - list that receives database names in file order
        """
        # Read in the default BLASTDB.list.
        self.default_listfile = os.path.join(P.BIRCHDIR, 'admin', 'Settings.default', 'BLASTDB.list')
        self.default_dblist = self.ReadBLASTDBList(self.default_listfile, DBNAMES_ALL)

        # Read the local settings file if it exists. Update field 0, the
        # human-readable description of the database, with the value from
        # the default dblist. This updates the local dblist if there are
        # changes in this field in a new BIRCH release. Also, if a new
        # release has db files not in the local list, add those to the
        # local list. Records are copied so that later changes to the local
        # install status never modify the default list.
        self.local_listfile = os.path.join(P.BIRCHDIR, 'local', 'admin', 'BLASTDB.list')
        if os.path.exists(self.local_listfile):
            self.local_dblist = self.ReadBLASTDBList(self.local_listfile, DBNAMES_ALL)
            for dbname in self.default_dblist:
                if dbname in self.local_dblist:
                    self.local_dblist[dbname][0] = self.default_dblist[dbname][0]
                else:
                    self.local_dblist[dbname] = list(self.default_dblist[dbname])
        else:
            self.local_dblist = {}
            for dbname in self.default_dblist:
                self.local_dblist[dbname] = list(self.default_dblist[dbname])

        # dictionaries of metadata for files in BLASTDB directory and on the
        # remote FTP site
        self.localDBFILES = {}
        # Initialize dictionaries for each database name
        self.remoteDBFILES = {}
        for dbname in DBNAMES_ALL:
            self.remoteDBFILES[dbname] = {}

    def ReadBLASTDBList(self, FN, DBNAMES_ALL):
        """
        Read list of BLAST databases, descriptions and install status (0,1).

        Each data line has four comma-separated fields; the first is the
        database name. Blank lines, comment lines and lines with a different
        number of fields are ignored. Names not yet in DBNAMES_ALL are
        appended to it. Returns {name: [field1, field2, field3]}.
        """
        BLASTDICT = {}
        with open(FN, 'r') as Bfile:
            for line in Bfile:
                line = line.strip()
                if line == "" or line[0] == '#':
                    continue
                tokens = line.split(",")
                if len(tokens) == 4:
                    BLASTDICT[tokens[0]] = [tokens[1], tokens[2], tokens[3]]
                    if tokens[0] not in DBNAMES_ALL:
                        DBNAMES_ALL.append(tokens[0])
        if DEBUG:
            print('- - - - - BLASTDB list from ' + FN + '- - - - -')
            # DBNAMES_ALL may hold names read earlier from another file, so
            # only names present in this file can be shown.
            for k in DBNAMES_ALL:
                if k in BLASTDICT:
                    print(' ' + k + ',' + str(BLASTDICT[k]))
        return BLASTDICT

    def WriteBLASTDBList(self, DBNAMES_ALL):
        """
        Write list of BLAST databases, descriptions and install status (0,1)
        to the local BLASTDB.list, and make the file world-readable.
        """
        with open(self.local_listfile, 'w') as Bfile:
            Bfile.write('# DO NOT EDIT THIS FILE!\n')
            Bfile.write('# This file is automatically generated by BLupdate_blastdb.py during installation,\n')
            Bfile.write('# update or by birchadmin --> BLASTDB Configure\n')
            for dbname in DBNAMES_ALL:
                rec = self.local_dblist[dbname]
                Bfile.write(dbname + ',' + rec[0] + ',' + str(rec[1]) + ',' + str(rec[2]) + '\n')
        chmod_ar(self.local_listfile)

    def CheckBLASTDB(self, BLASTDB, DBNAMES_ALL):
        """
        Get a list of databases found in the BLASTDB directory.

        There are potentially a lot of ways this might be done, and no
        obvious best choice. Here, we just look to make sure that each
        database name is found, regardless of how many files share that
        name eg. nt includes files with names like nt.01.*

        Rebuilds self.localDBFILES from scratch, sets the install status
        field of every record in self.local_dblist, and returns the list of
        installed database names in DBNAMES_ALL order. As a side effect the
        current working directory is changed to BLASTDB.
        """
        # Start from an empty table so that files deleted since an earlier
        # scan are no longer reported as installed.
        self.localDBFILES = {}

        # First, create a dictionary of filenames for all databases
        # found in $BLASTDB
        os.chdir(BLASTDB)
        for filename in os.listdir(os.getcwd()):
            prefix0 = filename.split('.')[0]  # eg. nt
            # We only want BLAST files; ignore all other files
            if prefix0 in DBNAMES_ALL:
                F = FileMetaData()
                try:
                    F.getLocalMetaData(filename)
                except OSError:
                    # eg. a dangling symbolic link; nothing to record
                    continue
                if prefix0 not in self.localDBFILES:
                    self.localDBFILES[prefix0] = {}
                self.localDBFILES[prefix0][filename] = F

        # Next, iterate through the database names, and set the
        # +/- field depending on whether a database name appears in
        # the list of files. Here is also where we create DBNAMES_INSTALLED,
        # which lists installed databases in the order in which they
        # appear in the db .list files.
        DBNAMES_INSTALLED = []
        for dbname in DBNAMES_ALL:
            if dbname in self.localDBFILES:
                self.local_dblist[dbname][2] = 1
                DBNAMES_INSTALLED.append(dbname)
            else:
                self.local_dblist[dbname][2] = 0
        return DBNAMES_INSTALLED

    def CheckFTPsite(self, FTPSITE, DBNAMES_ALL):
        """
        Get a list of database files found at the remote FTP site.

        FTPSITE must be a key of FTPINFO. Rebuilds self.remoteDBFILES from
        the directory listing of the site's database directory.
        """
        print('Checking ' + FTPSITE)
        FTPDIR = FTPINFO[FTPSITE]["dbdir"]
        UseMLSD = FTPINFO[FTPSITE]["UseMLSD"]

        dirlines = []
        ftp = ftplib.FTP(FTPSITE)
        try:
            ftp.login()
            ftp.cwd(FTPDIR)
            if UseMLSD:
                ftp.retrlines('MLSD', dirlines.append)  # Python 3 - use ftp.mlsd
            else:
                ftp.retrlines('LIST', dirlines.append)
        finally:
            try:
                ftp.quit()
            except ftplib.all_errors:
                ftp.close()

        # Start with an empty entry per database so that a repeated check
        # does not list the same remote file twice.
        for dbname in DBNAMES_ALL:
            self.remoteDBFILES[dbname] = {}

        for line in dirlines:
            F = FileMetaData()
            F.getFTPMetaData(line, UseMLSD)
            if F.Name.endswith('.md5'):
                tgzname = F.Name[:-4]
            else:
                # eg. nt.00.tar.gz
                tgzname = F.Name
            dbname = F.Name.split('.')[0]  # eg. nt
            # We only want BLAST files; ignore all other files
            if dbname in DBNAMES_ALL:
                # Each section of the database (eg. nt.20.tar.gz) has two
                # files: nt.20.tar.gz and nt.20.tar.gz.md5. We store these
                # files in a dictionary whose key is the name of the tar.gz
                # file eg. nt.20.tar.gz
                section = self.remoteDBFILES[dbname]
                if tgzname in section:
                    section[tgzname][tgzname].append(F)
                else:
                    section[tgzname] = {tgzname: [F]}

    def WriteLocalReport(self, BLASTDB, DBNAMES_INSTALLED):
        """
        Write a spreadsheet-ready report with statistics on the local copy
        of the NCBI databases. The report is a tab-separated value file
        written to $BLASTDB/localstats.tsv.
        """
        TAB = "\t"
        OFN = os.path.join(BLASTDB, 'localstats.tsv')
        # First spreadsheet row holding a database line (after the header).
        starting_row = 6
        # earlydate is a ridiculously early date used as the starting point
        # when looking for the most recent file of a database.
        earlydate = datetime.datetime.strptime('1970-01-01 00:00:00', "%Y-%m-%d %H:%M:%S")
        with open(OFN, 'w') as LRfile:
            LRfile.write('BLupdate_blastdb.py:' + TAB + 'LOCAL BLAST DATABASE REPORT\n')
            LRfile.write('\n')
            LRfile.write('Database Directory:' + TAB + BLASTDB + '\n')
            LRfile.write('\n')
            LRfile.write('DB name:' + TAB + 'size (Mb)' + TAB + 'Last Update' + '\n')
            row = starting_row
            for dbname in DBNAMES_INSTALLED:
                subtotal = 0
                MostRecent = earlydate
                for fname in self.localDBFILES[dbname]:
                    F = self.localDBFILES[dbname][fname]
                    subtotal = subtotal + F.Size
                    if MostRecent < F.Mtime:
                        MostRecent = F.Mtime
                sMostRecent = MostRecent.strftime("%Y-%m-%d %H:%M")
                LRfile.write(dbname + TAB + str(subtotal/1000000) + TAB + sMostRecent + "\n")
                row += 1
            last_row = row - 1
            # We don't calculate a total. We insert a formula that lets the
            # spreadsheet calculate the total (or 0 when there are no rows).
            if last_row >= starting_row:
                LRfile.write('TOTAL:' + TAB + '=SUM(B' + str(starting_row) + ':B' + str(last_row) + ")\n")
            else:
                LRfile.write('TOTAL:' + TAB + '0' + "\n")
        chmod_ar(OFN)

    def WriteFTPReport(self, BLASTDB, FTPSITE, DBNAMES_ALL):
        """
        Write a spreadsheet-ready report with statistics on the remote NCBI
        databases. The report is a tab-separated value file written to
        $BLASTDB/ftpstats.tsv.
        """
        TAB = "\t"
        OFN = os.path.join(BLASTDB, 'ftpstats.tsv')
        # First spreadsheet row holding a database line (after the header).
        starting_row = 7
        earlydate = datetime.datetime.strptime('1970-01-01 00:00:00', "%Y-%m-%d %H:%M:%S")
        with open(OFN, 'w') as LRfile:
            LRfile.write('BLupdate_blastdb.py:' + TAB + 'REMOTE FTP BLAST DATABASE REPORT\n')
            LRfile.write('\n')
            LRfile.write('FTP site:' + TAB + FTPSITE + '\n')
            LRfile.write('Database Directory:' + TAB + FTPINFO[FTPSITE]["dbdir"] + '\n')
            LRfile.write('\n')
            LRfile.write('DB name:' + TAB + 'compressed size (Mbytes)' + TAB + 'Modification Time' + '\n')
            row = starting_row
            for dbname in DBNAMES_ALL:
                subtotal = 0
                MostRecent = earlydate
                for tgzname in self.remoteDBFILES[dbname]:
                    for F in self.remoteDBFILES[dbname][tgzname][tgzname]:
                        subtotal = subtotal + F.Size
                        if MostRecent < F.Mtime:
                            MostRecent = F.Mtime
                sMostRecent = MostRecent.strftime("%Y-%m-%d %H:%M")
                LRfile.write(dbname + TAB + str(subtotal/1000000) + TAB + sMostRecent + '\n')
                row += 1
            last_row = row - 1
            # We don't calculate a total. We insert a formula that calculates
            # the total (or 0 when there are no rows).
            if last_row >= starting_row:
                LRfile.write('TOTAL:' + TAB + '=SUM(B' + str(starting_row) + ':B' + str(last_row) + ')\n')
            else:
                LRfile.write('TOTAL:' + TAB + '0' + '\n')
        chmod_ar(OFN)

    def FindNewFiles(self, DBLIST):
        """
        Return a list of database files .tar.gz files that are newer on the
        server than those in the local directory. In practice, this means
        that if the local *.tar.gz.md5 file is older than the *.tar.gz.md5
        file on the remote site, we download the newer *.tar.gz files from
        the remote site.

        Returns {dbname: sorted list of .tar.gz names} with one entry for
        every name in DBLIST; names unknown to the remote listing get an
        empty list.
        """
        NewFiles = {}
        for dbname in DBLIST:
            found = []
            localfiles = self.localDBFILES.get(dbname)
            remote = self.remoteDBFILES.get(dbname, {})
            for tgzname in remote:
                for F in remote[tgzname][tgzname]:
                    if not F.Name.endswith(".md5"):
                        continue
                    # New download: no local copy of this .md5 file.
                    # Update available: local .md5 is older than remote one.
                    if (localfiles is None
                            or F.Name not in localfiles
                            or localfiles[F.Name].Mtime < F.Mtime):
                        print('New: ' + F.Name)
                        found.append(F.Name[:-4])
            found.sort()
            NewFiles[dbname] = found
        return NewFiles

    def CreateFastaNamefiles(self, BIRCHDIR, BLASTDB, DBNAMES_INSTALLED):
        """
        Create .nam files so that FASTA can find BLAST databases.

        One file <dbname>.nam is written to BIRCHDIR/dat/fasta for each
        installed database. As a side effect the current working directory
        is changed to that directory.
        """
        FastaDirName = os.path.join(BIRCHDIR, 'dat', 'fasta')
        os.chdir(FastaDirName)
        FileHeader = '<${GB}'  # first line in all .nam files
        for dbname in DBNAMES_INSTALLED:
            # create a file with names of all .psq files (protein) or
            # .nsq files (nucleotide) for a given database subset
            seqtype = self.local_dblist[dbname][1]
            if seqtype in ["n", "N"]:    # nucleic acids seq file
                ext = ".nsq"
            elif seqtype in ["p", "P"]:  # protein seq file
                ext = ".psq"
            else:
                # Unknown sequence type: only the header line is written.
                ext = None
            with open(dbname + '.nam', 'w') as NameFile:
                NameFile.write(FileHeader + '\n')
                if ext is None:
                    continue
                # Dictionaries are not sorted, so write the file names in
                # sorted key order.
                for fn in sorted(self.localDBFILES[dbname]):
                    F = self.localDBFILES[dbname][fn]
                    if F.Name.endswith(ext):
                        prefix = F.Name[:-4]
                        NameFile.write(prefix + ' 12\n')

    def CreateBLMenu(self, DBNAMES, Directory, TNAME):
        """
        Create a BioLegato .blmenu file from a template file.

        Reads Directory/TNAME.blmenu.template and writes
        Directory/TNAME.blmenu, which is then made world-readable.

        NOTE(review): several string literals in this method are empty
        ('') where placeholder tags from the template would be expected,
        and the two nested chooser helpers are never called. It looks like
        markup-like text was stripped from this copy of the file. The
        statements below are kept exactly as found; as written, the
        template is copied to the output unchanged after a three-line
        header. Restore the tags from the upstream source before relying
        on chooser expansion.
        """
        # Test for End of line condition in input
        def EOF(line):
            if line == "":
                result = True
            else:
                result = False
            return result

        def ReadChooser(DONE):
            """
            Read a template file to be used for creating chooser variables.
            A varible in the final .blmenu file will be implemented for each
            database in the list. Returns a list of lines.
            """
            Chooser = []
            line = Templatefile.readline()
            DONE = EOF(line)
            while not line.startswith('') and not EOF(line):
                Chooser.append(line)
                line = Templatefile.readline()
            return Chooser

        def WriteChooser(ChooserTemplate, DB, OutputFile):
            """
            Write a chooser variable to the output file for a given database
            section. We do this by substituting markup tags from the
            template with desired values. Another way to have done this
            would have been to just have this method write out the complete
            PCD for each chooser. However, that approach has the
            disadvantage that even minor changes in the PCD have to be made
            in the Python code, rather than in the PCD template.
            """
            for line in ChooserTemplate:
                outputline = line.replace('', DB)
                outputline = outputline.replace('', self.local_dblist[db][0])
                if self.local_dblist[db][2] == 1:
                    InstSymbol = '+'
                else:
                    InstSymbol = '-'
                outputline = outputline.replace('', InstSymbol)
                outputline = outputline.replace('', str(self.local_dblist[db][2]))
                OutputFile.write(outputline)

        TemplateFN = os.path.join(Directory, TNAME + '.blmenu.template')
        OutputFN = os.path.join(Directory, TNAME + '.blmenu')
        Templatefile = open(TemplateFN, 'r')
        OutputFile = open(OutputFN, 'w')
        OutputFile.write('# DO NOT EDIT THIS FILE!\n')
        OutputFile.write('# This file is automatically generated by BLupdate_blastdb.py during installation,\n')
        OutputFile.write('# update or by birchadmin --> UpdateAddInstall\n')
        line = Templatefile.readline()
        DONE = EOF(line)
        dblist = ""  # used for the --dblist option
        while not DONE:
            if line.startswith('') > -1:
                line = line.replace('', dblist)
                OutputFile.write(line)
            else:
                OutputFile.write(line)
            if not DONE:
                line = Templatefile.readline()
                DONE = EOF(line)
        Templatefile.close()
        OutputFile.close()
        chmod_ar(OutputFN)

    def UpdateFiles(self, P, DBNAMES_INSTALLED, DBNAMES_ALL, LOGFILE):
        """
        Cycle through the names of databases in the order given in
        P.DBLIST. For each database, download and install every .tar.gz
        file that FindNewFiles() reports as new or updated on the FTP site,
        logging the outcome of each file to LOGFILE.
        """
        Indent4 = ' ' * 4
        NewFiles = self.FindNewFiles(P.DBLIST)
        for dbname in P.DBLIST:
            if len(NewFiles[dbname]) == 0:
                LOGFILE.write(Indent4 + Indent4 + dbname + ' up to date. Nothing to install.' + '\n')
            else:
                LOGFILE.write(Indent4 + '----- Updating ' + dbname + ' -----\n')
                FTPDIR = FTPINFO[P.FTPSITE]["dbdir"]
                for tgzname in NewFiles[dbname]:
                    SUCCESS = InstallFile(tgzname, P.FTPSITE, FTPDIR, LOGFILE)
                    if SUCCESS:
                        LOGFILE.write(Indent4 + Indent4 + 'Successfully installed ' + tgzname + '\n')
                    else:
                        LOGFILE.write(Indent4 + Indent4 + '>>> INSTALL OF ' + tgzname + ' FAILED\n')
            LOGFILE.write('\n')

    def AddFiles(self, P, DBNAMES_INSTALLED, DBNAMES_ALL, LOGFILE):
        """
        Cycle through the names of databases in the order given in
        P.DBLIST. Databases that are already installed are skipped; for
        the others every .tar.gz file listed on the FTP site is downloaded
        and installed, logging the outcome of each file to LOGFILE.
        """
        Indent4 = ' ' * 4
        for dbname in P.DBLIST:
            if dbname in DBNAMES_INSTALLED:
                LOGFILE.write(Indent4 + dbname + ' already installed\n')
            else:
                LOGFILE.write(Indent4 + '----- Adding ' + dbname + ' -----\n')
                # A name unknown to the remote listing has no files to get.
                FilesToGet = sorted(self.remoteDBFILES.get(dbname, {}))
                FTPDIR = FTPINFO[P.FTPSITE]["dbdir"]
                for tgzname in FilesToGet:
                    SUCCESS = InstallFile(tgzname, P.FTPSITE, FTPDIR, LOGFILE)
                    if SUCCESS:
                        LOGFILE.write(Indent4 + Indent4 + 'Successfully installed ' + tgzname + '\n')
                    else:
                        LOGFILE.write(Indent4 + Indent4 + '>>> INSTALL OF ' + tgzname + ' FAILED\n')
            LOGFILE.write('\n')

    def Revise(self, P, DBNAMES_INSTALLED, DBNAMES_ALL):
        """
        Revise local database metadata, BioLegato menus and FASTA .nam
        files. The DBNAMES_INSTALLED argument is ignored: the installed
        list is recomputed from the BLASTDB directory first.
        """
        DBNAMES_INSTALLED = self.CheckBLASTDB(P.BLASTDB, DBNAMES_ALL)
        if DEBUG:
            print('Installed: ' + str(DBNAMES_INSTALLED))
        self.WriteBLASTDBList(DBNAMES_ALL)

        # Write BioLegato menus for birchadmin
        Directory = os.path.join(P.BIRCHDIR, 'dat', 'birchadmin', 'PCD', 'UpdateAddInstall')
        for menu in ['BlastDBUpdate', 'BlastDBAdd', 'BlastDBDelete']:
            self.CreateBLMenu(DBNAMES_ALL, Directory, menu)

        # Write BioLegato menus for bldna, blprotein to search local BLAST databases
        Directory = os.path.join(P.BIRCHDIR, 'dat', 'bldna', 'PCD', 'Database')
        for menu in ['BLASTNlocal', 'BLASTXlocal', 'TBLASTXlocal']:
            self.CreateBLMenu(DBNAMES_INSTALLED, Directory, menu)
        Directory = os.path.join(P.BIRCHDIR, 'dat', 'blprotein', 'PCD', 'Database')
        for menu in ['BLASTPlocal', 'TBLASTNlocal']:
            self.CreateBLMenu(DBNAMES_INSTALLED, Directory, menu)

        # Create name files so that FASTA can find BLAST databases
        self.CreateFastaNamefiles(P.BIRCHDIR, P.BLASTDB, DBNAMES_INSTALLED)
        Directory = os.path.join(P.BIRCHDIR, 'dat', 'bldna', 'PCD', 'Database')
        for menu in ['FASTADNA', 'FASTXY']:
            self.CreateBLMenu(DBNAMES_INSTALLED, Directory, menu)
        Directory = os.path.join(P.BIRCHDIR, 'dat', 'blprotein', 'PCD', 'Database')
        for menu in ['FASTAPROTEIN', 'TFASTA']:
            self.CreateBLMenu(DBNAMES_INSTALLED, Directory, menu)
menu in ['BLASTPlocal','TBLASTNlocal'] : self.CreateBLMenu(DBNAMES_INSTALLED,Directory,menu) # Create name files so that FASTA can find BLAST databases self.CreateFastaNamefiles(P.BIRCHDIR,P.BLASTDB,DBNAMES_INSTALLED) Directory = os.path.join(P.BIRCHDIR, 'dat', 'bldna', 'PCD', 'Database') for menu in ['FASTADNA','FASTXY'] : self.CreateBLMenu(DBNAMES_INSTALLED,Directory,menu) Directory = os.path.join(P.BIRCHDIR, 'dat', 'blprotein', 'PCD', 'Database') for menu in ['FASTAPROTEIN','TFASTA'] : self.CreateBLMenu(DBNAMES_INSTALLED,Directory,menu) #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ # Read parameters from command line P = Parameters() adminEmail = GetBIRCHProperties(P.BIRCHDIR,"adminEmail") if DEBUG : print('adminEmail: ' + adminEmail) Settings = BIRCHSettings(P) DBList = BLASTDBList(P,DBNAMES_ALL) print('DBNAMES_ALL: ' + str(DBNAMES_ALL)) LOGFN = os.path.join(P.BLASTDB,'update_blastdb.log') if P.CONFIGURE: print("--------- Configure ----------") # * * * * Step 1: Initialize local settings * * * * Settings.dict['BLASTDB'] = P.BLASTDB Settings.WriteBIRCHSettings(P.SFN) Settings.WriteBIRCHenvBourne(P) Settings.WriteBIRCHenvCsh(P) # * * * * Step 2: Revise BioLegato menus and FASTA .name files * * * * # Write BioLegato menus for birchadmin DBNAMES_INSTALLED = DBList.CheckBLASTDB(P.BLASTDB,DBNAMES_ALL) DBList.Revise(P,DBNAMES_INSTALLED,DBNAMES_ALL) elif P.REPORTLOCAL: print("--------- ReportLocal ----------") DBNAMES_INSTALLED = DBList.CheckBLASTDB(P.BLASTDB,DBNAMES_ALL) DBList.WriteLocalReport(P.BLASTDB,DBNAMES_INSTALLED) elif P.REPORTFTP: print("--------- ReportFTP ----------") DBList.CheckFTPsite(P.FTPSITE,DBNAMES_ALL) DBList.WriteFTPReport(P.BLASTDB,P.FTPSITE,DBNAMES_ALL) elif P.ADDFILES: print("---------- Addfiles ----------") LOGFILE = open(LOGFN,'w') LOGFILE.write('\n') LOGFILE.write('Local host: ' + LocalHostname() + '\n') LOGFILE.write('Local BLAST database directory: ' + 
P.BLASTDB + '\n') LOGFILE.write('\n') LOGFILE.write('FTP Site: ' + P.FTPSITE + '\n') LOGFILE.write('FTP Directory: ' + FTPINFO[P.FTPSITE]['dbdir'] + '\n') LOGFILE.write('\n') StartTime = datetime.datetime.now() LOGFILE.write('Adding files: \n') LOGFILE.write('Start time: ' + str(StartTime) + '\n') LOGFILE.write('\n') # * * * * Step 1: Add new files to the local database * * * * DBNAMES_INSTALLED = DBList.CheckBLASTDB(P.BLASTDB,DBNAMES_ALL) DBList.CheckFTPsite(P.FTPSITE,DBNAMES_ALL) DBList.AddFiles(P,DBNAMES_INSTALLED,DBNAMES_ALL,LOGFILE) # * * * * Step 2: Revise BioLegato menus and FASTA .name files * * * * DBList.Revise(P,DBNAMES_INSTALLED,DBNAMES_ALL) FinishTime = datetime.datetime.now() LOGFILE.write('\n') LOGFILE.write('Finish time: ' + str(FinishTime) + '\n') ElapsedTime = FinishTime - StartTime LOGFILE.write('Elapsed time: ' + str(ElapsedTime) + '\n') LOGFILE.close() # * * * * Step 3: Notify user when job is completed. * * * * Subject = 'BLupdate_blastdb.py --addfiles completed' Message = 'BLupdate_blastdb.py: Completed Installing Blast Databases
' LOGFILE = open(os.path.join(P.BLASTDB,'update_blastdb.log'),'r') for line in LOGFILE.readlines() : Message = Message + line + '
' LOGFILE.close() SendEmail(adminEmail,[adminEmail],Subject,Message) elif P.DELETEFILES: print("---------- Delete files ----------") LOGFILE = open(LOGFN,'w') LOGFILE.write('\n') LOGFILE.write('Local host: ' + LocalHostname() + '\n') LOGFILE.write('Local BLAST database directory: ' + P.BLASTDB + '\n') LOGFILE.write('\n') StartTime = datetime.datetime.now() LOGFILE.write('Deleting files: \n') LOGFILE.write('Start time: ' + str(StartTime) + '\n') LOGFILE.write('\n') # * * * * Step 1: Delete files from the local database * * * * DBNAMES_INSTALLED = DBList.CheckBLASTDB(P.BLASTDB,DBNAMES_ALL) for dbname in P.DBLIST : if dbname in DBNAMES_INSTALLED : LOGFILE.write(' ----- Deleting files from ' + dbname + ' -----\n') DeleteFilesByPrefix(P.BLASTDB,dbname,LOGFILE) else: LOGFILE.write(dbname + ' not installed. Doing nothing. \n') LOGFILE.write('\n') # * * * * Step 2: Revise BioLegato menus and FASTA .name files * * * * DBNAMES_INSTALLED = DBList.CheckBLASTDB(P.BLASTDB,DBNAMES_ALL) DBList.Revise(P,DBNAMES_INSTALLED,DBNAMES_ALL) FinishTime = datetime.datetime.now() LOGFILE.write('\n') LOGFILE.write('Finish time: ' + str(FinishTime) + '\n') ElapsedTime = FinishTime - StartTime LOGFILE.write('Elapsed time: ' + str(ElapsedTime) + '\n') LOGFILE.close() # * * * * Step 3: Notify user when job is completed. * * * * Subject = 'BLupdate_blastdb --deletefiles completed' Message = 'BLupdate_blastdb.py: Completed Deleting Blast Database files
' LOGFILE = open(os.path.join(P.BLASTDB,'update_blastdb.log'),'r') for line in LOGFILE.readlines() : Message = Message + line + '
' LOGFILE.close() SendEmail(adminEmail,[adminEmail],Subject,Message) elif P.UPDATEDB: print("---------- Update ----------") LOGFILE = open(LOGFN,'w') LOGFILE.write('\n') LOGFILE.write('Local host: ' + LocalHostname() + '\n') LOGFILE.write('Local BLAST database directory: ' + P.BLASTDB + '\n') LOGFILE.write('\n') LOGFILE.write('FTP Site: ' + P.FTPSITE + '\n') LOGFILE.write('FTP Directory: ' + FTPINFO[P.FTPSITE]['dbdir'] + '\n') LOGFILE.write('\n') StartTime = datetime.datetime.now() LOGFILE.write('Updating files: \n') LOGFILE.write('Start time: ' + str(StartTime) + '\n') LOGFILE.write('\n') # * * * * Step 1: Update files in the local database * * * * DBNAMES_INSTALLED = DBList.CheckBLASTDB(P.BLASTDB,DBNAMES_ALL) DBList.CheckFTPsite(P.FTPSITE,DBNAMES_ALL) DBList.UpdateFiles(P,DBNAMES_INSTALLED,DBNAMES_ALL,LOGFILE) # * * * * Step 2: Revise BioLegato menus and FASTA .name files * * * * DBList.Revise(P,DBNAMES_INSTALLED,DBNAMES_ALL) FinishTime = datetime.datetime.now() LOGFILE.write('\n') LOGFILE.write('Finish time: ' + str(FinishTime) + '\n') ElapsedTime = FinishTime - StartTime LOGFILE.write('Elapsed time: ' + str(ElapsedTime) + '\n') LOGFILE.close() # * * * * Step 3: Notify user when job is completed. * * * * Subject = 'BLupdate_blastdb --updatedb completed' Message = 'BLupdate_blastdb.py: Completed Updating Blast Databases
' LOGFILE = open(os.path.join(P.BLASTDB,'update_blastdb.log'),'r') for line in LOGFILE.readlines() : Message = Message + line + '
' LOGFILE.close() SendEmail(adminEmail,[adminEmail],Subject,Message) else: print(USAGE) # We need to flush the stdout buffer to avoid a sys.excepthook error message. # See http://stackoverflow.com/questions/12790328/how-to-silence-sys-excepthook-is-missing-error try: sys.stdout.flush() sys.stderr.flush() except: pass BM.exit_success() if (BM.documentor() or "-test" in sys.argv): pass else: main()