#!/usr/bin/env python3 # June 3, 2023, Dr. Brian Fristensky, University of Manitoba """ Description: Extract a subset of sequences from a GDE flatfile." Synopsis: BLExtractSubset.py namefile infile outfile" Files: namefile list of sequence names to be written. If namefile is a TSV file, the leftmost field will be read as a name. infile GDE flat file containing all sequences outfile GDE flat file containing subset of sequences listed in namefile """ import sys import string import os.path import shutil # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Read in namefile def GETNAMES(FN) : FILE = open(FN,'r') NAMES = [] for LINE in FILE : if LINE.startswith("#") : pass else: LINE = LINE.strip() if len(LINE) > 0 : F0 = LINE.split("\t")[0] if not F0 in NAMES : NAMES.append(F0) FILE.close() return NAMES # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Read each sequence in infile. If a sequence name is in the # list, write the sequence to a separate file in the temporary # directory def GETSEQS(INFILE,TEMPDIR) : os.chdir(TEMPDIR) # create a dummy file just so that we have a file to close # the first time the loop is executed. This also takes care # of files in which the first sequence begins after the first line OUTFILE = open('/dev/null','w') for LINE in INFILE : LINE = LINE.strip() if len(LINE) > 0 : if LINE[0] in SEQFLAGS : #start a new output file TOKENS = LINE.split() SEQNAME = TOKENS[0][1:] OUTFILE.close() OUTFILE = open(SEQNAME + '.flat','w') OUTFILE.write(LINE + '\n') else : #copy the line to output file OUTFILE.write(LINE + '\n') else : pass INFILE.close() # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Write only the sequences listed in namefile to the output file, # in the order they appear in namefile def WRITESEQS(NAMES,OFN) : OUTFILE = open(OFN,'w') if len(NAMES) > 0: for N in NAMES : FN = N + '.flat' if os.path.exists(FN) : INFILE = open(FN,'r') for LINE in INFILE : OUTFILE.write(LINE) INFILE.close() OUTFILE.close() os.chdir(CWD) #======================== MAIN PROCEDURE ========================== #---------- Set global variables NFN = sys.argv[1] IFN = sys.argv[2] OFN = sys.argv[3] OUTFILE = open(OFN,'w') SEQFLAGS = (['>','#','%','"','@']) # indicate sequence name line CWD = os.getcwd() if os.path.exists(NFN) : # Read in namefile NAMES = GETNAMES(NFN) if os.path.exists(IFN) : # Create a temporary working directory TEMPDIR = 'BLExtractSubset.' + str(os.getpid()) os.mkdir(TEMPDIR,0o700) INFILE = open(IFN,'r') GETSEQS(INFILE,TEMPDIR) INFILE.close() #Write sequence to outfile WRITESEQS(NAMES,os.path.join(CWD,OFN)) # Clean up shutil.rmtree(TEMPDIR) else: print('BLExtract Subset: File not found: ' + IFN) else : print('BLExtract Subset: File not found: ' + NFN)