#!/usr/bin/env python3
"""
blfastaout.py - Translate FASTA table output into other formats, and open
those files using the appropriate application, or write output to a file.

Synopsis:
    blfastaout.py infile [--delete] --destination dest [--outfile filename]

    infile - tsv file produced by fasta3 -m "F8C" option

    --delete - Delete infile when finished. This is mainly intended for
        running from BioLegato, where infile is a temporary file.

    --destination dest - dest is one of the following:

        For FASTA viewable Report:
            textedit - open output files in text editor specified by the
                       $BL_TextEditor environment variable
            browser  - open in web browser specified by $BL_browser
            textfile - write to files, using the basename specified by
                       destination.
            htmlfile - write to HTML file

        For FASTA tsv Report:
            blnfetch - Open in blnfetch, BioLegato interface for retrieving
                       DNA/RNA entries using ACCESSION numbers
            blpfetch - Open in blpfetch, BioLegato interface for retrieving
                       protein entries using ACCESSION numbers
            tsvfile  - write to a tsvfile.

    --outfile - filename for saving an output file.

@modified: November 9, 2024
@author: Brian Fristensky
@contact: brian.fristensky@umanitoba.ca
"""

import argparse
import contextlib
import os
import shlex
import shutil
import subprocess
import sys

DEBUG = True

# Destinations that hand the input file to a viewer. The viewer helpers take
# care of removing the file themselves, so main() must not delete it as well.
VIEWER_DESTINATIONS = ("textedit", "browser")
# Destinations that copy the input file unchanged to --outfile.
COPY_DESTINATIONS = ("textfile", "htmlfile")
# Destinations that launch a BioLegato fetch interface on the parsed table.
FETCH_DESTINATIONS = ("blnfetch", "blpfetch")
# Destinations that need the FASTA table to be parsed.
TABLE_DESTINATIONS = FETCH_DESTINATIONS + ("tsvfile",)
# Destinations that cannot work without --outfile.
FILE_DESTINATIONS = COPY_DESTINATIONS + ("tsvfile",)
ALL_DESTINATIONS = VIEWER_DESTINATIONS + COPY_DESTINATIONS + TABLE_DESTINATIONS

# Extra columns inserted after the subject id for non-user databases. The
# order matches the "%a|%t|%S|%T" format requested from blastdbcmd.
INFO_COLUMNS = ("title", "species", "taxid")

# SPECIAL CASES: database names that map onto a different BLAST database.
DATABASE_ALIASES = {
    "mouse_genome": "GCF_000001635.26_top_level",
    "human_genome": "GCF_000001405.38_top_level",
}

# The first line after the list of hits starts with this flag.
STOPFLAG = "# FASTA processed"


class Parameters:
    """
    Wrapper class for command line parameters
    """

    def __init__(self, argv=None):
        """
        Initializes arguments:
            IFN=""
            DELETEFILE=False
            DESTINATION=""
            OFN=""
            PID = str(os.getpid())
        Then calls read_args() to fill in their values from command line.

        argv - optional list of arguments to parse instead of sys.argv[1:]
        """
        self.IFN = ""
        self.DELETEFILE = False
        self.DESTINATION = ""
        self.OFN = ""
        self.PID = str(os.getpid())
        self.read_args(argv)

    def read_args(self, argv=None):
        """
        Read command line arguments into a Parameter object.

        argv - optional list of arguments; None means sys.argv[1:].
        Exits through argparse (SystemExit) on an unknown destination, or
        when a destination that writes a file is given without --outfile.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("infile", type=str, help="Input filename")
        parser.add_argument("--delete", action="store_true", default=False,
                            help="Delete input file when done")
        parser.add_argument("--destination", type=str, default="blnfetch",
                            choices=ALL_DESTINATIONS,
                            help="Name of program to launch or type of file to save")
        parser.add_argument("--outfile", type=str, default="",
                            help="Filename used when action is to save a file")
        args = parser.parse_args(argv)

        # Fail early with a usage message, rather than with an obscure
        # error from shutil/os on an empty output path.
        if args.destination in FILE_DESTINATIONS and not args.outfile:
            parser.error("--outfile is required when --destination is "
                         + args.destination)

        self.IFN = args.infile
        self.DELETEFILE = args.delete
        self.DESTINATION = args.destination
        self.OFN = args.outfile

        if DEBUG:
            print('------------ Parameters from command line ------' + '\n')
            print('    IFN: ' + self.IFN)
            print('    DELETEFILE: ' + str(self.DELETEFILE))
            print('    DESTINATION: ' + self.DESTINATION)
            print('    OFN: ' + self.OFN)


# -------------------------------------------------------------------------------
def _fix_column_headings(line):
    """
    The column headings generated by FASTA include some fields not found
    in the output, so we get rid of those headings: the first four
    comma-separated tokens are replaced by a single "subject id" heading.
    """
    return ["subject id"] + line.split(",")[4:]


def _accession_from_subject(subject):
    """
    Extract the accession number from a '|'-delimited subject id.

    gi|1464306813|ref|NC_038294.1|  ->  NC_038294.1  (fourth field)
    Ids with fewer than four fields fall back to the second field, and ids
    for which that is empty are returned unchanged.
    """
    parts = subject.split("|")
    if len(parts) > 3:
        return parts[3]
    # NOTE(review): assumes shorter ids have the form db|accession|locus,
    # so the accession is the second field - confirm against real databases.
    if len(parts) > 1 and parts[1]:
        return parts[1]
    return subject


class FastaTable:
    """
    Methods and data for FASTA output in table format.
    """

    def __init__(self):
        """
        Initializes arguments:
            header = []          comment lines copied to the output
            ColumnHeadings = []  headings taken from the Fields line
            Data = []            one list of fields per hit
            Database = ""        database name, or "userfile"
        """
        self.header = []
        self.ColumnHeadings = []
        self.Data = []
        self.Database = ""

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    def ReadOutput(self, IFN):
        """
        For tabular FASTA output, parse the Fields line so that the
        names of fields appear as column headings, and read the hits
        into self.Data.

        IFN - name of the file holding the FASTA table output
        """
        # To make processing easier, get rid of lines after the list of
        # hits. The first line after the hits starts with STOPFLAG
        lines = []
        with open(IFN, "r") as infile:
            for raw in infile:
                if raw.startswith(STOPFLAG):
                    break
                lines.append(raw.strip())

        # First, we process the leading comment lines, keeping all comments
        # but the Fields line, which is turned into the column headings.
        raw_db_line = ""  # stays empty if the file has no Database: line
        idx = 0
        while idx < len(lines) and lines[idx].startswith("#"):
            line = lines[idx]
            if line.startswith("# Fields:"):
                self.ColumnHeadings = _fix_column_headings(line)
                if DEBUG:
                    print(self.ColumnHeadings)
            else:
                if line.startswith("# Database:"):
                    raw_db_line = line
                self.header.append(line)
            idx += 1

        # Next, extract the name of the database from the Database: line eg.
        # # Database: @/home/psgendb/BIRCHDEV/dat/fasta/refseqgene.nam
        # We'll use this in RunBlastdbcmd which retrieves title and species
        # info for each hit, which FASTA doesn't put into the table report.
        # A user-defined database will not contain the '@' character, and is
        # recorded as "userfile". A missing Database: line is treated the
        # same way.
        if "@" in raw_db_line:
            dbpath = raw_db_line.partition("@")[2]
            dbname = dbpath.rpartition("/")[2].partition(".")[0]
            self.Database = DATABASE_ALIASES.get(dbname, dbname)
        else:
            self.Database = "userfile"

        # Now read the rest of the file
        for line in lines[idx:]:
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            # Blank or truncated lines have no subject field; skip them.
            if len(fields) < 2:
                continue
            # Omit field 0, which is the query name. The first field kept is
            # fields[1], from which the accession number has to be extracted
            # when searching one of the installed databases.
            subject = fields[1]
            if self.Database != "userfile" and "|" in subject:
                subject = _accession_from_subject(subject)
            # Now, add on the rest of the fields
            self.Data.append([subject] + [f.strip() for f in fields[2:]])

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    def AddInfo(self, D, nfields=0):
        """
        Add fields from dictionary D to Data.

        D       - dictionary keyed by the first field of each row; each
                  value is a list of fields to insert after that field.
        nfields - minimum number of fields inserted per row. Rows whose key
                  is missing from D, or whose entry is shorter, are padded
                  with empty strings so that columns stay aligned with the
                  headings. The default of 0 inserts nothing for rows that
                  are missing from D.
        """
        for row in self.Data:
            if not row:
                continue
            info = list(D.get(row[0], []))
            info.extend([""] * (nfields - len(info)))
            row[1:1] = info

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    def WriteTable(self, OFN):
        """
        Write header lines, column headings and data lines to OFN as
        tab-separated text.

        OFN - name of the output file
        """
        TAB = "\t"
        NL = "\n"
        with open(OFN, "w") as outfile:
            # Write header lines
            for line in self.header:
                outfile.write(line + NL)

            # Write column headings. There are none to write if the input
            # had no Fields line.
            if self.ColumnHeadings:
                headings = [self.ColumnHeadings[0]]
                if self.Database != "userfile":
                    headings.extend(INFO_COLUMNS)
                headings.extend(self.ColumnHeadings[1:])
                outfile.write("#" + TAB.join(headings) + NL)

            # Write datalines
            for datalist in self.Data:
                outfile.write(TAB.join(datalist) + NL)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def _parse_blastdbcmd_line(line):
    """
    Split one "%a|%t|%S|%T" line into (accession, [title, species, taxid]).

    The title is free text and may itself contain '|', so the accession is
    split off the front and the last two fields off the back.
    Returns None for lines with no accession or no separator.
    """
    accession, sep, rest = line.partition("|")
    if not sep or not accession:
        return None
    return accession, rest.rsplit("|", 2)


def RunBlastdbcmd(P, FT):
    """
    Run blastdbcmd to retrieve title and species information for FASTA hits.

    P  - Parameters object; P.PID is used to name the temporary files
    FT - fasta table data
    returns a dictionary of lists, with data for each accession number.
    The dictionary is empty when there are no hits, or when blastdbcmd
    could not be run or wrote no output; a warning is printed to stderr
    in the last two cases.
    """
    infodict = {}
    if not FT.Data:
        return infodict

    accfilename = P.PID + ".acc"
    OFN = P.PID + ".tmplist"
    try:
        # First, create a temporary file listing accession numbers as
        # input for blastdbcmd
        with open(accfilename, 'w') as accfile:
            accfile.writelines(row[0] + "\n" for row in FT.Data)

        # Run blastdbcmd to save the desired fields into a temporary
        # output file. The exit status is not checked: hits that could not
        # be found are simply left without extra information.
        try:
            subprocess.run(["blastdbcmd", "-db", FT.Database,
                            "-entry_batch", accfilename,
                            "-outfmt", "%a|%t|%S|%T", "-out", OFN],
                           check=False)
        except OSError as err:
            print("blfastaout.py: could not run blastdbcmd: " + str(err),
                  file=sys.stderr)
            return infodict

        # Import the blastdbcmd output into a dictionary keyed by accession
        # number. Strip off the whitespace (ie. \n) as each line is read.
        try:
            with open(OFN, 'r') as infofile:
                for line in infofile:
                    parsed = _parse_blastdbcmd_line(line.strip())
                    if parsed is not None:
                        infodict[parsed[0]] = parsed[1]
        except FileNotFoundError:
            print("blfastaout.py: blastdbcmd produced no output for database "
                  + FT.Database, file=sys.stderr)
    finally:
        # remove temp files, even if something above failed
        for tmpname in (accfilename, OFN):
            with contextlib.suppress(FileNotFoundError):
                os.remove(tmpname)
    return infodict


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNTEXTEDIT(OFN):
    """
    Run the texteditor in the background and remove the temporary file
    when done

    OFN - name of the file to open; it is removed with $RM_CMD after the
          editor command returns.
    """
    # It's surprising how many issues there are with launching multiple
    # files in a text editor. choose_edit_wrapper.sh takes care of
    # these issues.
    # The file name is quoted so that names containing spaces or shell
    # metacharacters are passed through intact.
    quoted = shlex.quote(OFN)
    COMMAND = ('(nohup `choose_edit_wrapper.sh` ' + quoted
               + '; $RM_CMD ' + quoted + ' > /dev/null)&')
    os.system(COMMAND)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RUNBROWSER(OFN, DELETEFILE):
    """
    Run web browser through chooseviewer.py and wait for that command
    to return.

    OFN        - name of the file to open
    DELETEFILE - if true, pass --delete to chooseviewer.py
    """
    command = ['chooseviewer.py', OFN]
    if DELETEFILE:
        command.append('--delete')
    subprocess.run(command, check=False)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RunBioLegato(DESTINATION, FN):
    """
    Run the blnfetch or blpfetch in the background and remove the
    temporary file when done

    DESTINATION - name of the program to launch
    FN          - name of the file to open; removed once the program returns
    """
    quoted = shlex.quote(FN)
    COMMAND = ('(nohup ' + shlex.quote(DESTINATION) + ' ' + quoted
               + '; rm -f ' + quoted + ' > /dev/null)&')
    os.system(COMMAND)


# ======================== MAIN PROCEDURE ==========================
def main(argv=None):
    """
    Called when not in documentation mode.

    argv - optional list of arguments; None means sys.argv[1:].
    """
    P = Parameters(argv)

    # Write the output to a file, or send it to a window, as specified
    # in --destination
    if P.DESTINATION == 'textedit':
        RUNTEXTEDIT(P.IFN)
    elif P.DESTINATION == 'browser':
        RUNBROWSER(P.IFN, True)
    elif P.DESTINATION in COPY_DESTINATIONS:
        shutil.copy2(P.IFN, P.OFN)
    elif P.DESTINATION in TABLE_DESTINATIONS:
        FT = FastaTable()
        FT.ReadOutput(P.IFN)
        PARSEDOFN = P.PID + '.' + 'tsv'
        if FT.Database != "userfile":
            infodict = RunBlastdbcmd(P, FT)
            # Pad hits blastdbcmd could not find, so that every row lines
            # up with the title/species/taxid headings.
            FT.AddInfo(infodict, nfields=len(INFO_COLUMNS))
        FT.WriteTable(PARSEDOFN)
        if P.DESTINATION in FETCH_DESTINATIONS:
            RunBioLegato(P.DESTINATION, PARSEDOFN)
        else:
            # shutil.move also works when the output file is on a
            # different filesystem, where os.rename fails.
            shutil.move(PARSEDOFN, P.OFN)

    # If --delete, delete the input file when done. The viewer destinations
    # are skipped: the editor command removes the file itself after the
    # editor returns, and chooseviewer.py is already passed --delete, so
    # removing it here could pull the file out from under the viewer.
    if P.DELETEFILE and P.DESTINATION not in VIEWER_DESTINATIONS:
        with contextlib.suppress(FileNotFoundError):
            os.remove(P.IFN)


# Importing this module must not run the program; "-test" on the command
# line still suppresses it, as before.
if __name__ == "__main__" and "-test" not in sys.argv:
    main()