#!/usr/local/bin/python # April 28, 2005, Dr. Brian Fristensky, University of Manitoba # Description: Given a file of GI numbers, return sequences in # GenBank or Fasta format. We use SHoundGetGenBankff or SHoundGetFasta, # rather than the SHoundGetGenBankffList or SHoundGetFastaffList. These # List methods are unreliable for very large sequences. If even # one sequence can't be returned, nothing is returned. Instead # we retrieve one sequence at a time, so that the impact of # a failure is minimized. # Synopsis: SHGetSeq.py gifile -f outputformat outfile # options: infile GDE flat file, with name on line 1, followed by # comma separated list of tokens GI numbers # -f Either 'fasta' for fasta or 'genbank' for GenBank # outfile GenBank or FASTA file import sys import string import os # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class Parameters : "Wrapper class for command line parameters" def __init__(self) : self.IFN = "" self.FORMAT = "fasta" self.METHOD = "SHoundGetFasta" self.OFN = "" # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def READARGS(P) : "Read command line arguments into a Parameter object" NUMARGS = len(sys.argv) if NUMARGS >= 1 : P.IFN = sys.argv[1] I = 2 while (I < NUMARGS) : if sys.argv[I] == "-f" : I = I + 1 if I < NUMARGS : TEMP = sys.argv[I] if (TEMP in ['fasta','genbank']) : P.FORMAT = TEMP I = I + 1 else : P.OFN = sys.argv[I] I = NUMARGS return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class FILE : "Wrapper class for files" def __init__(self,FILENAME,MODE) : self.FN = FILENAME self.F = open(FILENAME,MODE) self.LINE = "" # most recent line read # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class IDLST : "Wrapper class for ID lists" def __init__(self) : self.NAME = "" self.LST = [] # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Read in old and new strings, striping # leading and trailing whitespace, including # newline characters. def GETGDELIST(INFILE,NAMEFLAG,GILST) : # Read name line while (INFILE.LINE != "" and GILST.NAME == "") : INFILE.LINE = INFILE.LINE.strip() if len(INFILE.LINE) > 0 : if INFILE.LINE[0] == NAMEFLAG : GILST.NAME = INFILE.LINE[1:] INFILE.LINE = INFILE.F.readline() # Read GI list GILST.LST = [] if GILST.NAME != "" : # GDE wraps the flat file with newlines every 60 # characters. # Next, we have to delete the newlines to turn the entire # file into a single long string called BIGLINE BIGLINE = "" DONE = 0 while (INFILE.LINE != "" and DONE ==0) : TMPLINE = INFILE.LINE.strip() if len(TMPLINE) > 0 : if TMPLINE[0] == NAMEFLAG : DONE = 1 else: BIGLINE = BIGLINE + TMPLINE INFILE.LINE = INFILE.F.readline() # parse the string as a comma separated list GILST.LST = BIGLINE.split(',') return # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Retrieve the sequences and write them to outfile def GETSEQS(GILST,METHOD,OFN) : # Create a temporary filename TFN = 'SHGetSeq.' + str(os.getpid()) LEN = len(GILST.LST) if LEN > 0 : for i in range(0,LEN) : MPI = str(GILST.LST[i]) COMMAND = 'leash -mn ' + METHOD + ' -mpi ' + MPI + ' -of ' + TFN os.system(COMMAND) OKAY = False if os.path.exists(TFN) : if os.path.getsize(TFN) > 0 : OKAY = True # Append the contents of the temporary file to the output file COMMAND = 'cat ' + TFN + '>> ' + OFN os.system(COMMAND) os.remove(TFN) return #======================== MAIN PROCEDURE ========================== P = Parameters () READARGS(P) #---------- Set global constants NAMEFLAG = '"' # 1st character on the name line, indicating # the beginning of the next data list MAXSEQ = 1000000 # maximum sequence length that SeqHound can retrieve # Default is SHoundGetFastaList if (P.FORMAT == 'genbank') : P.METHOD = 'SHoundGetGenBankff' INFILE = FILE(P.IFN,'r') # ------------------------- MAIN LOOP ----------------------- # GDE flatfile may contain 0 or more lists, so we iterate # for each list. # Note that GETGDELIST takes care of reading in the next # input line. INFILE.LINE = INFILE.F.readline() # LINE contains the most recently-read line while (INFILE.LINE != "") : # Read in GDE flat file GILST = IDLST() GETGDELIST(INFILE,NAMEFLAG,GILST) # Create a new list containing only those entries # whose length is less than MAXSEQ #Retrieve the sequence and write it to outfile GETSEQS(GILST,P.METHOD,P.OFN) INFILE.F.close()