#!/usr/bin/env python import birchscript import re import os import os.path import shutil import subprocess import sys # FEATURES - a c-shell script (version 14 June 09) # FEATURES is a front end for GETOB. FEATURES provides an interactive # menu interface, as well as automating the process of resolving # nested database expressions # This script automates, as much as is possible, two tasks. First, getob # expects all of its input to come from files, as described in getob.doc. # The parameter menu lets the user input some of the data directly at the # keyboard, for cases in which only one data element (eg. accession number) # is needed. Otherwise, FEATURES prompts the user for a filename. # The second thing that FEATURES does is to run getob and check to see # whether there are unresolved expressions in the output. If there are, # getob is called as many times as is necessary to resolve all expressions. ##################################################################### # set parameter defaults #################################################################### interactive = True NICELEVEL = 10 jobid = os.getpid() featype = "" features = "" entrytype = "n" entries = "" dbtype = "g" database = "" where = "a" outfile = "" resolve = "" acno = False needtosplit = False options = "" FEAINF = "FEA." + jobid + ".inf" FEANAM = "FEA." + jobid + ".nam" FEAOUT = "FEA." + jobid + ".out" ##################################################################### # Read options from the command line and set parameters, # or prompt user for parameters at terminal. #################################################################### numargs = len(sys.argv) if numargs != 0: #--------------------------------------------------- #parameters given in command line interactive = False files = "" #One argument form: a FEATURES expression is given as the input, returning ONLY #the resultant sequence to the standard output. if numargs == 1: # -h : print usage message if sys.argv[1] == "-h": print "Usage: features" print " features expression" print " features [-f featurekey | -F keyfile]" print " [-n name |-a accession | -e expression |" print " -N namefile |-A accfile | -E expfile]" print " [-u dataset | -U dataset | -g ]" exit() else: resolve = "r" features = "" entries = sys.argv[1] #Make sure the expression begins with '@' if entries[0] != '@': entries = "@" + entries h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" #make dummy infile print >> h_fea, "SITES" #for getob h_fea.close() h_feanam = open(FEANAM, "w") print >> h_feanam, entries h_feanam.close() #accession number will be used for output filenames # set base = `cut -c2-80 $FEANAM | cut -f1 -d':'` base = entries[1:].split(':')[0] acno = True else: #Multiple argument form: feature keys, entries, expressions, and databases are #specified on the command line. Message, sequence and expression files are written. index = 1 while index < numargs: if sys.argv[index] in ("-f", "-F"): index += 1 features = sys.argv[index] resolve = "" h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" if sys.argv[index] == "-f": print >> h_fea, features elif sys.argv[index] == "-F": birchscript.cat_to(features, h_fea) print >> h_fea, "SITES" h_fea.close() elif sys.argv[index] in ("-a", "-A", "-n", "-N"): index += 1 entries = sys.argv[index] h_fea = open(FEANAM, "w") if sys.argv[index] in ("-a", "-n"): base = entries #accession number used for output file names print >> h_fea, entries elif sys.argv[index] in ("-A", "-N"): base = os.path.splitext(os.path.basename(entries))[0] #raw filename used for output filenames birchscript.cat_to(entries, h_fea) if sys.argv[index] in ("-a", "-A"): acno = True elif sys.argv[index] in ("-n", "-N"): acno = False elif sys.argv[index] == "-e": resolve = "r" features = "" index += 1 entries = sys.argv[index] #Make sure the expression begins with '@' if entries[0] != "@": entries = "@" + entries h_feanam = open(FEANAM, "w") print >> h_feanam, entries h_feanam.close() h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" #make dummy infile print >> h_fea, "SITES" #for getob h_fea.close() #accession number will be used for output filenames #set base = `cut -c2-80 $FEANAM |cut -f1 -d':'` base = entries[1:].split(':')[0] acno = True elif sys.argv[index] == "-E": resolve = "r" features = "" index += 1 entries = sys.argv[index] base = os.path.splitext(os.path.basename(entries))[0] #raw filename used for output filenames h_feanam = open(FEANAM, "w") birchscript.cat_to(entries, h_feanam) h_feanam.close() h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" #make dummy infile print >> h_fea, "SITES" #for getob h_fea.close() acno = True elif sys.argv[index] in ("-u", "-U"): dbtype = sys.argv[index][1] index += 1 database = sys.argv[index] dbname = os.path.splitext(database)[0] extension = os.path.splitext(database)[1] if sys.argv[index] == "-U": base = dbname if extension == ".gen": #If user-defined database is one or more #GenBank entries, it will have to be #split by splitdb for use by programs. needtosplit = True else: needtosplit = False anofile = dbname + ".ano" seqfile = dbname + ".wrp" indfile = dbname + ".ind" index += 1 end # while #Make sure all necessary data has been entered if resolve != "r" and features == "": print '>>> Features must be specified' exit() # csh subtlety: $entries must be quoted to allow for blanks in # the string if entries == "" and dbtype != "U": print '>>> Entries must be specified' exit() else: #--------------------------------------------------------------- # Interactive parameter input complete = False while not complete: #Display current parameter settings print '___________________________________________________________________' print ' FEATURES - Version 28 Jan 05 ' print ' Please cite: Fristensky (1993) Nucl. Acids Res. 21:5997-6003' print '___________________________________________________________________' print 'Features: ' + features print 'Entries: ' + entries print 'Dataset: ' + database print '___________________________________________________________________' print ' Parameter Description Value' print '-------------------------------------------------------------------' print '1).................... FEATURES TO EXTRACT ....................> ' + featype print ' f:Type a feature at the keyboard ' print ' F:Read a list of features from a file' print '2)....................ENTRIES TO BE PROCESSED (choose one).....> ' + entrytype print ' Keyboard input - n:name a:accession # e:expression' print ' File input - N:name(s) A:accession #(s) E:expression(s)' print '3)....................WHERE TO GET IT .........................> ' + dbtype print ' u:Genbank dataset g:complete GenBank database' print ' U: same as u, but all entries' print '4)....................WHERE TO SEND IT ........................> ' + where print ' s:Each feature to a separate file a:All output to same file' print ' ---------------------------------------------------------------' #Prompt for parameter to change paramnum = int(raw_input(' Type number of your choice or 0 to continue:')) if paramnum == 0: complete = True elif paramnum == 1: if resolve == "r": print '>>> No features needed to resolve expressions.' else: temp = raw_input('Enter one of {fF}:') if temp in ("f", "F"): featype = temp if featype == "f": resolve = "" features = raw_input('Enter feature:') h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" print >> h_fea, features print >> h_fea, "SITES" h_fea.close() elif featype == "F": resolve = "" features = raw_input('Enter filename: ') h_fea = open(FEAINF, "w") birchscript.cat_to(features, h_fea) h_fea.close() else: print '>>> Invalid choice' # f,F # resolve == r elif paramnum == 2: temp = raw_input('Enter one of {naeNAE}:') if temp in ("n", "a", "e", "N", "A", "E"): entrytype = temp if entrytype == "n": entries = raw_input('Enter name:') base = entries #entry name used for output file names h_feanam = open(FEANAM, "w") print >> h_feanam, entries h_feanam.close() acno = False elif entrytype == "a": entries = raw_input('Enter accession number:') base = entries #accession number used for # output file names h_feanam = open(FEANAM, "w") print >> h_feanam, entries h_feanam.close() acno = True elif entrytype == "N": entries = raw_input('Enter filename:') base = os.path.splitext(os.path.basename(entries))[0] #raw filename used for output filenames h_feanam = open(FEANAM, "w") birchscript.cat_to(entries, h_feanam) h_feanam.close() acno = False elif entrytype == "A": entries = raw_input('Enter filename:') base = os.path.splitext(os.path.basename(entries))[0] #raw filename used for output filenames h_feanam = open(FEANAM, "w") birchscript.cat_to(entries, h_feanam) h_feanam.close() acno = True elif entrytype == "e": resolve = "r" features = "" entries = raw_input('Enter expression:') #Make sure the expression begins with '@' if entries[0] != "@": entries = "@" + entries h_feanam = open(FEANAM, "w") print >> h_feanam, entries h_feanam.close() h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" #make dummy infile print >> h_fea, "SITES" #for getob h_fea.close() #accession number will be used for output filenames base = entries[1].split(':')[0] acno = True elif entrytype == "E": resolve = "r" features = "" entries = raw_input('Enter filename:') base = os.path.splitext(os.path.basename(entries))[0] #raw filename used for output filenames h_feanam = open(FEANAM, "w") birchscript.cat_to(entries, h_feanam) h_feanam.close() h_fea = open(FEAINF, "w") print >> h_fea, "OBJECTS" #make dummy infile print >> h_fea, "SITES" #for getob h_fea.close() acno = True else: print '>>> Invalid choice' #naeNAE elif paramnum == 3: temp = raw_input('Enter one of {uUg}:') if temp in ("U", "u"): dbtype = temp database = raw_input('Enter filename:') dbname = os.path.splitext(database)[0] if temp == "U": base = dbname extension = os.path.splitext(database)[1] if extension == ".gen": #If user-defined database is one or more #GenBank entries, it will have to be #split by splitdb for use by programs. needtosplit = True else: needtosplit = False anofile = dbname + ".ano" seqfile = dbname + ".wrp" indfile = dbname + ".ind" elif temp == "g": dbtype = "g" else: print '>>> Invalid choice' elif paramnum == 4: temp = raw_input('Enter one of {sa}:') if temp in ("s", "a"): where = temp else: print '>>> Invalid choice' #Make sure all necessary data has been entered if complete: if resolve != "r" and features == "": print '>>> Features must be specified (option 1)' complete = False # csh subtlety: $entries must be quoted to allow for blanks in # the string if entries == "" and dbtype != "U": print '>>> Entries must be specified (option 2)' complete = False #while # argv #set options if resolve == "r": options += " -r" if acno: options += " -c" if where == "s": options += " -f" ##################################################################### # For a given database, extract the specified features. #################################################################### if interactive: termout = "/dev/tty" print 'Messages will be written to ' + base + ".msg" if where == "a": print 'Final sequence output will be written to ' + base + ".out" else: print 'Output for each entry will be written to individual files.' print 'Filenames will be in the form: name.obj' if resolve != "r": print 'Expressions will be written to ' + base + ".exp" else: termout = "/dev/null" # interactive # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # The first pass will usually resolve all feature expressions. #Retrieve GenBank entries, if necessary if dbtype == "g": if resolve == "r": #Unresolved expressions will begin with '@' in first column. #Find lines with unresolved expressions and extract the #accession numbers for use by fetch. egrep -e ^@ $FEANAM > UNRESOLVED.$jobid.fea # first, extract ACCESSION number from between @ and : # If it is in the form ACCESSION.VERSION, truncate the version field cut -c2-80 UNRESOLVED.$jobid.fea |cut -f1 -d':' |cut -f1 -d'.' > UNRESOLVED.$jobid.nam fetchfile = "UNRESOLVED." + jobid + ".nam" else: fetchfile = FEANAM anofile = base + ".ano" seqfile = base + ".wrp" indfile = base + ".ind" fetch $fetchfile FEA.$jobid.gen > $termout if os.path.exist("FEA." + jobid + ".gen"): os.nice(NICELEVEL) splitdb FEA.$jobid.gen $anofile $seqfile $indfile os.nice(0) os.remove("FEA." + jobid + ".gen") else: exit() #Extract features from the entries if interactive: print 'Extracting features...' if needtosplit: os.nice(NICELEVEL) splitdb $database $anofile $seqfile $indfile os.nice(0) if dbtype == 'U': shutil.copyfile(indfile, FEANAM) # quick test to make sure that $FEANAM and $anofile aren't empty @ NUMLINES = (`wc -l $FEANAM | sed -e "s/^[ ]*//" | cut -f1 -d" "`) grep 'LOCUS' $anofile > FEA.$jobid.test os.nice(NICELEVEL) if NUMLINES != 0 and os.path.getsize("FEA." + jobid + ".test") != 0: h_base = open(base + "." + jobid + ".term") if where == "a": if numargs == 1: subprocess.call(["getob", options, FEAINF, FEANAM, anofile, seqfile, \ indfile, "/dev/null", FEAOUT, "/dev/null"], stdout=h_base) else: subprocess.call(["getob", options, FEAINF, FEANAM, anofile, seqfile, indfile, \ base + ".msg", FEAOUT, base + ".exp"], stdout=h_base) else: subprocess.call(["getob", options, FEAINF, FEANAM, anofile, seqfile, indfile, \ base + ".msg", base + ".exp"], stdout=h_base) os.nice(0) # Clean up a bit. Database files could be huge! if dbtype == "g" or needtosplit: os.remove(anofile) os.remove(seqfile) os.remove(indfile) #Pull out all lines containing indirect references # RESTORE THESE LINES, AND DELETE THE SUBSEQUENT 5 LINES, # ONCE WE GET A WORKING EQUIVALENT OT NFETCH. # if ($where == a) then # egrep -e ^@ $FEAOUT > UNRESOLVED.$jobid.fea # else # egrep -e ^@ *.obj > UNRESOLVED.$jobid.fea # endif if where == "a": ls_dir = ["FEA." + jobid + ".out"] else: ls_dir = [filename for filename in os.listdir(os.getcwd()) if os.path.splitext(filename)[1] == ".obj" and os.path.isfile(filename)] # egrep -v -e ^@ *.obj > temp.$jobid.fea h_temp = open("temp." + jobid + ".fea", "w") for file in ls_dir: h_file = open(file) for line in h_file: if not line.startswith('@'): h_temp.write(line) h_file.close() h_temp.close() shutil.copyfile("temp." + jobid + ".fea", "FEA." + jobid + ".out") # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # This part resolves any unresolved expressions that can be resolved. # Only in very rare cases will this loop execute more than once. # Prevent loop from executing by generating a zero length file. # We'll have to leave these features unresolved until we get # a program to retrieve GenBank entries across the network. # while !(-z UNRESOLVED.$jobid.fea) #is not empty """ while ( 1 == 0): # never true if ($where == s) then echo "\>>> Can't resolve references sent to separate files." echo '\>>> Re-run FEATURES separately for each unresolved reference' echo "\>>> See $base.msg to find unresolved references." exit else if interactive: print 'Resolving Feature expression(s)...' #extract accession numbers to be retrieved # first, extract ACCESSION number from between @ and : # If it is in the form ACCESSION.VERSION, truncate the version field cut -c2-80 UNRESOLVED.$jobid.fea | cut -f1 -d':' | cut -f1 -d'.' > $FEANAM #retrieve the sequences into a new file, and create #a database subset to be used by getob. If $GB is set, then #entries will be fetched from the online GenBank database. #Otherwise, features will attempt to get entries from the #user-generated database subset. if (${?GB} | ${?XYLEM_RHOST}) then set fetchoptions = "" else set fetchoptions = "-G $database" endif # fetch $fetchoptions $FEANAM FEA.$jobid.gen >$termout # lfetch $fetchoptions $FEANAM FEA.$jobid.gen >$termout # At present we don't have a program to retrieve GenBank entries # from a remote database, so we just create a dummy output file, # This means that features which reference other entries will not # be resolved. echo '' > FEA.$jobid.gen # test for empty GenBank file #grep 'LOCUS' FEA.$jobid.gen > FEA.$jobid.test #if !(-z FEA.$jobid.test) then @ COUNT = `grep -c 'LOCUS' FEA.$jobid.gen` echo 'COUNT: '$COUNT if ($COUNT > 0) then splitdb FEA.$jobid.gen FEA.$jobid.ano FEA.$jobid.wrp FEA.$jobid.ind #run getob again to resolve indirect references echo 'Extracting features...' > $termout mv $FEAOUT UNRESOLVED.$jobid.out if ($numargs == 1) then nice $NICELEVEL getob -r -c $FEAINF UNRESOLVED.$jobid.out FEA.$jobid.ano\ FEA.$jobid.wrp FEA.$jobid.ind /dev/null $FEAOUT >>\ $base.$jobid.term else nice $NICELEVEL getob -r -c $FEAINF UNRESOLVED.$jobid.out FEA.$jobid.ano\ FEA.$jobid.wrp FEA.$jobid.ind FEA.$jobid.msg $FEAOUT\ >> $base.$jobid.term cat FEA.$jobid.msg >> $base.msg endif else echo "" > $FEAOUT endif #Pull out all lines containing indirect references #ie. lines beginning with '@' egrep -e ^@ $FEAOUT > UNRESOLVED.$jobid.fea endif end # while""" if where == "a": #One argument form, command mode #Sequence output only goes to standard output. if numargs == 1: birchscript.cat_to("FEA." + jobid + ".out", sys.stdout) #Multi-argument form or interactive mode else: shutil.copyfile("FEA." + jobid + ".out", base + ".out") ##################################################################### # Clean up. #################################################################### for file in os.listdir(os.getcwd()): filename = os.path.basename(file) if filename.startswith("FEA." + jobid + ".") or filename.startswith("UNRESOLVED." + jobid + ".") \ or filename.endswith("." + jobid + ".term"): os.remove(os.path.join(os.getcwd(), filename))