#!/usr/local/bin/python
# February 22, 2006, Dr. Brian Fristensky, University of Manitoba

# Description: Given a file of ID numbers, return IDs or sequences in
# GenBank or Fasta format. The
# List methods are unreliable for very large lists. If even
# one sequence can't be returned, nothing is returned. Instead
# we retrieve one sequence at a time, so that the impact of
# a failure is minimized.

# Synopsis: SHGet.py gifile -mn method -e extension outfile

# options: infile   GDE flat file, with name on line 1, followed by
#                   comma separated list of tokens or GI numbers
#          -e       extension to use for names eg. gi, taxid
#          -mn      SeqHound Method (eg.SHoundGetFasta, SHoundSequenceLength)
#          outfile  GenBank or FASTA file, list file

import sys
import string
import os

#---------- Global constants
NAMEFLAG = '"'    # 1st character on the name line, indicating
                  # the beginning of the next data list
MAXSEQ = 1000000  # maximum sequence length that SeqHound can retrieve
                  # (currently not referenced anywhere in this file)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters :
    """Wrapper class for command line parameters."""

    def __init__(self) :
        self.HELP = 'n'     # 'y' when the usage message should be printed
        self.IFN = ""       # input file name
        self.METHOD = ""    # value given after -mn
        self.EXT = ""       # value given after -e
        self.OFN = ""       # output file name
        # Methods dispatched to GETSEQS with the ' -mpil ' option
        self.SEQMETHODS = ['SHoundGetFasta','SHoundGetGenBankff','SHoundGetXMLSeqEntry']
        # Methods dispatched to GETSEQS with the ' -mpi ' option
        self.SEQLISTMETHODS = ['SHoundGetFastaList','SHoundGetGenBankffList']


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def READARGS(P) :
    """Read command line arguments from sys.argv into a Parameters object.

    Expected form:  infile [-mn method] [-e extension] [-h] outfile

    The first argument is the input file name, unless it is "-h".
    Options may appear in any order. The first token that is not a
    recognised option is taken as the output file name and ends parsing.
    With no arguments at all, P.HELP is set so that the caller prints
    the usage message. An option given as the very last token, with no
    value after it, is ignored.
    """
    NUMARGS = len(sys.argv)
    if NUMARGS < 2 :
        # Only the script name is present: ask for usage instead of
        # indexing past the end of sys.argv.
        P.HELP = 'y'
        return
    if sys.argv[1] == "-h" :
        P.HELP = 'y'
        return

    P.IFN = sys.argv[1]
    I = 2
    # Exactly one token is examined per iteration, so sys.argv is never
    # indexed without a bounds check first.
    while I < NUMARGS :
        ARG = sys.argv[I]
        I = I + 1
        if ARG == "-mn" :
            if I < NUMARGS :
                P.METHOD = sys.argv[I]
                I = I + 1
        elif ARG == "-e" :
            if I < NUMARGS :
                P.EXT = sys.argv[I]
                I = I + 1
        elif ARG == "-h" :
            P.HELP = 'y'
        else :
            P.OFN = ARG
            I = NUMARGS    # the output file name ends parsing
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class FILE :
    """Wrapper class for files: name, open handle and last line read."""

    def __init__(self,FILENAME,MODE) :
        self.FN = FILENAME
        self.F = open(FILENAME,MODE)
        self.LINE = ""    # most recent line read


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class IDLST :
    """Wrapper class for ID lists."""

    def __init__(self) :
        self.NAME = ""    # name of the list, without the leading NAMEFLAG
        self.STR = ""     # the list itself, as one comma separated string
        self.LST = []     # not filled in by any code in this file


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Print usage message
def PRINTHELP() :
    """Print the usage message to standard output."""
    print('Usage:')
    print('python SHGet.py infile -mn SeqHoundMethod -e extension outfile')
    print(' infile: GDE flatfile of identifieres eg. gi, taxid')
    print(' SeqHoundMethod: any method, as defined in the SeqHound API')
    print(' (see http://seqhound.blueprint.org)')
    print(' extension: string to use as a name extension (eg. gi,len)')
    print(' The extension helps identify the type of output')
    print(' outfile: output file of ids or sequences')
    print('')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def GETGDELIST(INFILE,NAMEFLAG,GILST) :
    """Read the next named list from a GDE flat file into GILST.

    INFILE   -- FILE object; INFILE.LINE must already hold the most
                recently read line ("" means end of file).
    NAMEFLAG -- character that marks a name line.
    GILST    -- IDLST object; NAME and STR are filled in.

    Lines are skipped until a name line is found. The following lines,
    stripped of surrounding whitespace, are concatenated into GILST.STR
    until the next name line or end of file. On return INFILE.LINE holds
    either that next name line (so the following call starts with it)
    or "" at end of file.
    """
    # Read name line
    while (INFILE.LINE != "" and GILST.NAME == "") :
        INFILE.LINE = INFILE.LINE.strip()
        if len(INFILE.LINE) > 0 :
            if INFILE.LINE[0] == NAMEFLAG :
                GILST.NAME = INFILE.LINE[1:]
        INFILE.LINE = INFILE.F.readline()

    # Read GI list
    GILST.STR = ""
    if GILST.NAME != "" :
        # GDE wraps the flat file with newlines every 60 characters.
        # The newlines are deleted so that the whole list becomes a
        # single long string.
        PIECES = []
        while INFILE.LINE != "" :
            TMPLINE = INFILE.LINE.strip()
            if len(TMPLINE) > 0 and TMPLINE[0] == NAMEFLAG :
                # Start of the next list: leave this line in INFILE.LINE
                # so that the next call can pick up its name.
                break
            PIECES.append(TMPLINE)
            INFILE.LINE = INFILE.F.readline()
        GILST.STR = "".join(PIECES)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def _APPENDFILE(SRCFN,DESTFN) :
    """Append the raw contents of file SRCFN to the end of file DESTFN."""
    SRC = open(SRCFN,'rb')
    try :
        DEST = open(DESTFN,'ab')
        try :
            CHUNK = SRC.read(65536)
            while CHUNK :
                DEST.write(CHUNK)
                CHUNK = SRC.read(65536)
        finally :
            DEST.close()
    finally :
        SRC.close()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Retrieve the sequences and write them to outfile
def GETSEQS(GILST,METHOD,OPTION,OFN) :
    """Run leash for the IDs in GILST.STR and append its output to OFN.

    GILST  -- IDLST object; only GILST.STR is used.
    METHOD -- method name passed to leash after -mn.
    OPTION -- option text placed in front of the ID list, including its
              surrounding blanks (eg. ' -mpil ').
    OFN    -- output file name; non-empty results are appended to it.

    Nothing is run when GILST.STR is empty. The temporary file written
    by leash is removed afterwards.
    """
    print(METHOD)

    # Create a temporary filename
    TFN = 'SHGet.' + str(os.getpid())

    if len(GILST.STR) > 0 :
        # NOTE(review): this command line is built by string
        # concatenation and run through the shell, so METHOD and the ID
        # list must not contain shell metacharacters.
        COMMAND = 'leash -mn ' + METHOD + OPTION + GILST.STR + ' -of ' + TFN
        os.system(COMMAND)
        if os.path.exists(TFN) :
            if os.path.getsize(TFN) > 0 :
                # Append the contents of the temporary file to the output file
                _APPENDFILE(TFN,OFN)
            os.remove(TFN)
    return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Retrieve the IDs and write them to outfile
def GETIDS(GILST,METHOD,OPTION,EXT,OFN) :
    """Run leash for the IDs in GILST.STR and append a named list to OFN.

    GILST  -- IDLST object; NAME and STR are used.
    METHOD -- method name passed to leash after -mn.
    OPTION -- option text placed in front of the ID list, including its
              surrounding blanks (eg. ' -mpi ').
    EXT    -- extension that replaces the part of GILST.NAME after its
              last '.', or is added when the name contains no '.'.
    OFN    -- output file name.

    For a non-empty result three things are appended to OFN: a name line
    (NAMEFLAG followed by the new name), the leash output with every
    newline turned into a comma, and a closing newline. Nothing is
    written when leash produces no output, and a missing leash output
    file is not treated as an error.
    """
    print(METHOD)

    # Create a temporary filename
    TFN = 'SHGet.' + str(os.getpid())

    L = GILST.NAME.rfind('.')
    if L > -1 :
        NEWNAME = GILST.NAME[:L] + '.' + EXT
    else :
        NEWNAME = GILST.NAME + '.' + EXT
    print(NEWNAME)

    if len(GILST.STR) > 0 :
        # The -sep option in Leash2.2 is broken. Until it's fixed,
        # we'll have to do a workaround: write to a second temporary
        # file and turn the newlines into commas ourselves.
        TTFN = TFN + 'temp'
        # NOTE(review): this command line is built by string
        # concatenation and run through the shell, so METHOD and the ID
        # list must not contain shell metacharacters.
        COMMAND = 'leash -mn ' + METHOD + OPTION + GILST.STR + ' -sep comma ' + ' -of ' + TTFN
        os.system(COMMAND)

        # leash may fail without creating its output file, so only
        # convert and delete the file when it is really there.
        if os.path.exists(TTFN) :
            RAW = open(TTFN,'rb')
            try :
                DATA = RAW.read()
            finally :
                RAW.close()
            CONVERTED = open(TFN,'wb')
            try :
                CONVERTED.write(DATA.replace(b'\n', b','))
            finally :
                CONVERTED.close()
            os.remove(TTFN)

        if os.path.exists(TFN) :
            if os.path.getsize(TFN) > 0 :
                # Append the contents of the temporary file to the
                # output file, preceded by a name line in the form
                # "name
                OUT = open(OFN,'a')
                try :
                    OUT.write(NAMEFLAG + NEWNAME + '\n')
                finally :
                    OUT.close()
                _APPENDFILE(TFN,OFN)
                OUT = open(OFN,'a')
                try :
                    OUT.write('\n')
                finally :
                    OUT.close()
            os.remove(TFN)
    return


#======================== MAIN PROCEDURE ==========================
def MAIN() :
    """Parse the command line and process every list in the input file."""
    P = Parameters ()
    READARGS(P)
    if P.HELP == 'y' :
        PRINTHELP()
    elif not (os.path.exists(P.IFN)) :
        print('SHGet.py: ' + P.IFN + ' not found')
    elif P.OFN == "" :
        # Without an output file there is nowhere to append results.
        print('SHGet.py: no output file specified')
        PRINTHELP()
    else:
        INFILE = FILE(P.IFN,'r')
        try :
            # ------------------------- MAIN LOOP -----------------------
            # GDE flatfile may contain 0 or more lists, so we iterate
            # for each list.
            # Note that GETGDELIST takes care of reading in the next
            # input line.
            INFILE.LINE = INFILE.F.readline() # LINE contains the most recently-read line
            while (INFILE.LINE != "") :

                # Read in GDE flat file
                GILST = IDLST()
                GETGDELIST(INFILE,NAMEFLAG,GILST)
                print(GILST.NAME)

                # Choose the retrieval routine and the leash option
                # from the method name.
                if P.METHOD in P.SEQMETHODS :
                    GETSEQS(GILST,P.METHOD,' -mpil ',P.OFN)
                elif P.METHOD in P.SEQLISTMETHODS :
                    GETSEQS(GILST,P.METHOD,' -mpi ',P.OFN)
                elif P.METHOD.endswith('List') :
                    GETIDS(GILST,P.METHOD,' -mpi ',P.EXT,P.OFN)
                else :
                    GETIDS(GILST,P.METHOD,' -mpil ',P.EXT,P.OFN)
        finally :
            # Close the input file even if a retrieval step fails
            INFILE.F.close()


if __name__ == "__main__" :
    MAIN()