#!/usr/local/bin/python
# July 18, 2008, Dr. Brian Fristensky, University of Manitoba
"""Extract a subset of sequences from a GDE flatfile.

Synopsis: BLExtractSubset.py namefile infile outfile

Files:
    namefile   list of sequence names to be written, one per line
    infile     GDE flat file containing all sequences
    outfile    GDE flat file containing the subset of sequences listed
               in namefile, in namefile order

Runs unchanged on Python 2.6+ and Python 3.
"""

import sys
import string
import os.path
import shutil

# Characters that mark a sequence name line when they start a line.
SEQFLAGS = ('>', '#', '%', '"', '@')

# Directory the script was started from. GETSEQS changes into the
# temporary directory and WRITESEQS changes back to this one.
CWD = os.getcwd()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read in namefile
def GETNAMES(FN):
    """Return the non-blank lines of file FN, stripped of whitespace."""
    with open(FN, 'r') as FILE:
        STRIPPED = (LINE.strip() for LINE in FILE)
        return [LINE for LINE in STRIPPED if LINE]


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read each sequence in infile. If a sequence name is in the
# list, write the sequence to a separate file in the temporary
# directory
def GETSEQS(INFILE, TEMPDIR, NAMES=None):
    """Split the open flat file INFILE into one <name>.flat file per sequence.

    The process changes directory into TEMPDIR, and the files are created
    there. INFILE is closed before returning. A sequence starts at any
    line whose first character is in SEQFLAGS; its name is the first
    whitespace-delimited token of that line without the flag character.
    Blank lines, and lines that precede the first sequence, are dropped.

    NAMES is optional. When given, only sequences whose name is in NAMES
    are written; when omitted, every sequence is written.
    """
    WANTED = None if NAMES is None else set(NAMES)
    # None means "no sequence file currently open": lines seen in that
    # state (before the first header, or inside an unwanted sequence)
    # are discarded.
    OUTFILE = None
    try:
        os.chdir(TEMPDIR)
        for LINE in INFILE:
            LINE = LINE.strip()
            if not LINE:
                continue
            if LINE[0] in SEQFLAGS:
                # A header line ends the previous sequence.
                if OUTFILE is not None:
                    OUTFILE.close()
                    OUTFILE = None
                SEQNAME = LINE.split()[0][1:]
                # NOTE(review): a sequence name containing a path separator
                # would make open() fail or write outside TEMPDIR; this is
                # not handled here.
                if WANTED is None or SEQNAME in WANTED:
                    OUTFILE = open(SEQNAME + '.flat', 'w')
                    OUTFILE.write(LINE + '\n')
            elif OUTFILE is not None:
                OUTFILE.write(LINE + '\n')
    finally:
        # Close the last sequence file explicitly so its contents are
        # flushed before WRITESEQS reads it.
        if OUTFILE is not None:
            OUTFILE.close()
        INFILE.close()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Write only the sequences listed in namefile to the output file,
# in the order they appear in namefile
def WRITESEQS(NAMES, OFN):
    """Concatenate the <name>.flat files for NAMES into the file OFN.

    The .flat files are looked up in the current directory (the temporary
    directory after GETSEQS has run). Names with no matching file are
    skipped silently. The process changes directory back to CWD afterwards.
    """
    try:
        with open(OFN, 'w') as OUTFILE:
            for N in NAMES:
                FN = N + '.flat'
                if os.path.exists(FN):
                    with open(FN, 'r') as INFILE:
                        shutil.copyfileobj(INFILE, OUTFILE)
    finally:
        os.chdir(CWD)


#======================== MAIN PROCEDURE ==========================
def main(ARGV=None):
    """Run the extraction for ARGV = [program, namefile, infile, outfile].

    ARGV defaults to sys.argv. Returns 1 when too few arguments are given
    and None otherwise, including when an input file is not found.
    """
    if ARGV is None:
        ARGV = sys.argv
    if len(ARGV) < 4:
        print('Synopsis: BLExtractSubset.py namefile infile outfile')
        return 1
    NFN, IFN, OFN = ARGV[1:4]

    # Create (or truncate) the output file up front, so that it exists
    # even when an input file turns out to be missing.
    open(OFN, 'w').close()

    if not os.path.exists(NFN):
        print('BLExtract Subset: File not found: ' + NFN)
        return None
    # Read in namefile
    NAMES = GETNAMES(NFN)

    if not os.path.exists(IFN):
        print('BLExtract Subset: File not found: ' + IFN)
        return None

    # Create a temporary working directory
    TEMPDIR = os.path.join(CWD, 'BLExtractSubset.' + str(os.getpid()))
    os.mkdir(TEMPDIR, 0o700)
    try:
        # IFN is opened before GETSEQS changes directory, so a relative
        # path still resolves against the starting directory.
        GETSEQS(open(IFN, 'r'), TEMPDIR, NAMES)
        # Write sequences to outfile. os.path.join leaves an absolute
        # OFN untouched.
        WRITESEQS(NAMES, os.path.join(CWD, OFN))
    finally:
        # Clean up even if extraction failed part-way.
        os.chdir(CWD)
        shutil.rmtree(TEMPDIR)
    return None


if __name__ == '__main__':
    sys.exit(main())