#!/usr/bin/env python
"""
bl_seqreadlist.py - given files with names of read files, print the part of an
assembler/analysis command line which specifies those read files.

Synopsis:
    bl_seqreadlist.py [--tsv tsvfile] [--pe pefile] [--mp pefile]
                      [--long singlereadfile] [--s singlereadfile]
                      --outtype spadescom --fullpaths [--outfile filename]

--outtype  Specifies the format required by various programs that work with
           reads. Currently supported values:
           spadescom, transratecom, polluxcom, abysscom, trinityp, trinitys

@modified: April 22, 2019
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca
"""

# optparse is deprecated in favor of argparse as of Python 2.7. However, since
# 2.7 is not always present on many systems, at this writing, it is safer to
# stick with optparse for now. It should be easy to change later, since the
# syntax is very similar between argparse and optparse.
from optparse import OptionParser

import os
import re
import sys

PROGRAM = "bl_seqreadlist.py : "
USAGE = ("\n\tUSAGE: bl_seqreadlist.py [--tsv tsvfile] [--pe pefile] [--mp pefile]"
         " [--long singlereadfile] [--s singlereadfile]"
         " --outtype filetype --fullpaths [--outfile filename]")

DEBUG = False  # Must be false when run by BioLegato
if DEBUG:
    print('bl_seqreadlist: Debugging mode on')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.
    """

    def __init__(self):
        """
        Set every parameter to its default, then overwrite the defaults with
        whatever was given on the command line (see read_args).

        Attributes:
            TSVFILE   = ""           file named by --tsv
            PEFILE    = ""           file named by --pe
            MPFILE    = ""           file named by --mp
            LONGFILE  = ""           file named by --long
            SEFILE    = ""           file named by --s
            OUTTYPE   = "spadescom"  value of --outtype
            FULLPATHS = False        True if --fullpaths was given
            OUTFILE   = ""           file named by --outfile
        """
        self.TSVFILE = ""
        self.PEFILE = ""
        self.MPFILE = ""
        self.LONGFILE = ""
        self.SEFILE = ""
        self.OUTTYPE = "spadescom"
        self.FULLPATHS = False
        self.OUTFILE = ""
        self.read_args()

        if DEBUG:
            print('------------ Parameters from command line ------')
            print(' TSVFILE: ' + self.TSVFILE)
            print(' PEFILE: ' + self.PEFILE)
            print(' MPFILE: ' + self.MPFILE)
            print(' LONGFILE: ' + self.LONGFILE)
            print(' SEFILE: ' + self.SEFILE)
            print(' OUTTYPE: ' + self.OUTTYPE)
            print(' FULLPATHS: ' + str(self.FULLPATHS))
            print(' OUTFILE: ' + self.OUTFILE)
            print('')

    def read_args(self):
        """
        Read command line arguments (sys.argv) into this Parameters object.
        """
        parser = OptionParser()
        parser.add_option("--tsv", dest="tsvfile", action="store", default="",
                          help="TAB-separated value file with names of paired-end and or single-end read files")
        parser.add_option("--pe", dest="pefile", action="store", default="",
                          help="file with names of paired-end read files")
        parser.add_option("--mp", dest="mpfile", action="store", default="",
                          help="file with names of mate-pair read files")
        parser.add_option("--long", dest="longfile", action="store", default="",
                          help="file with names of long read files")
        parser.add_option("--s", dest="sefile", action="store", default="",
                          help="file with names of single read files")
        parser.add_option("--outtype", dest="outtype", action="store", default="spadescom",
                          help="type of output")
        parser.add_option("--fullpaths", dest="fullpaths", action="store_true", default=False,
                          help="output full paths to files")
        parser.add_option("--outfile", dest="outfile", action="store", default="",
                          help="output file")

        (options, args) = parser.parse_args()
        self.TSVFILE = options.tsvfile
        self.PEFILE = options.pefile
        self.MPFILE = options.mpfile
        self.LONGFILE = options.longfile
        self.SEFILE = options.sefile
        self.OUTTYPE = options.outtype
        self.FULLPATHS = options.fullpaths
        self.OUTFILE = options.outfile


class TSVFiles:
    """
    Methods for reading lists of paired read TSV files, and for writing lists
    to output.
    """

    def __init__(self):
        """
        Initializes arguments:
            READPAIRS = []   list of [file] or [leftfile, rightfile] lists
        """
        self.READPAIRS = []

    def ReadTSVfile(self, FN, FULLPATHS):
        """
        Read a TSV file containing names of paired-end and/or single-end read
        files, appending one entry per usable line to self.READPAIRS.

        Paired-end files are on lines such as
            leftreadfile.fq.gz<TAB>rightreadfile.fq.gz
        Single-end files have each file on a separate line
            reads1.fq.gz
            reads2.fq.gz
            reads3.fq.gz

        Blank lines and lines beginning with '#' are skipped.

        FN        - name of the TSV file to read
        FULLPATHS - if True, convert each file name to an absolute path
        """
        TAB = '\t'
        # 'with' guarantees the file is closed even if a read fails.
        with open(FN, "r") as F:
            for line in F:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Get rid of double quotes that enclose fields when some
                # programs write output, and then split by TABs.
                tokens = line.replace('"', '').split(TAB)
                # Only names from the first two fields on a line are used;
                # blank fields and all other fields are ignored.
                fnames = [t.strip() for t in tokens[:2]]
                fnames = [name for name in fnames if name]
                if FULLPATHS:
                    fnames = [os.path.abspath(name) for name in fnames]
                if fnames:
                    self.READPAIRS.append(fnames)
        if DEBUG:
            print(str(self.READPAIRS))


class PairedFiles:
    """
    Methods for reading lists of paired read files, and for writing lists to
    output.
    """

    def __init__(self):
        """
        Initializes arguments:
            READPAIRS = []   list of [leftfile, rightfile] lists
        """
        self.READPAIRS = []

    def ReadPEfile(self, FN, FULLPATHS):
        """
        Read pairs of file names from the first line of FN and append them to
        self.READPAIRS.

        Assumes all data is on a single line of the form:
            left1,right1|left2,right2|left3,right3
        where the pipe character separates pairs of filenames, and left and
        right read pair files are separated by commas.

        Entries that do not supply two non-blank names (an empty file, a
        trailing '|', a name with no comma) are skipped rather than raising
        IndexError. Comma-separated fields beyond the first two are ignored.

        FN        - name of the file to read
        FULLPATHS - if True, convert each file name to an absolute path
        """
        with open(FN, "r") as F:
            line = F.readline()  # assumes all data is on a single line
        for entry in line.split('|'):
            names = [name.strip() for name in entry.split(',')]
            if len(names) < 2 or not names[0] or not names[1]:
                continue
            pair = names[:2]
            if FULLPATHS:
                pair = [os.path.abspath(name) for name in pair]
            self.READPAIRS.append(pair)
        if DEBUG:
            print(str(self.READPAIRS))


class SingleReadFiles:
    """
    Methods for reading lists of single read files, and for writing lists to
    output.
    """

    def __init__(self):
        """
        Initializes arguments:
            SREADFILES = []   list of file names
        """
        self.SREADFILES = []

    def ReadSfile(self, FN, FULLPATHS):
        """
        Read file names from the first line of FN and append them to
        self.SREADFILES.

        Assumes all data is on a single line of the form:
            file1|file2|file3...
        where the pipe character separates filenames. Blank names are skipped.

        FN        - name of the file to read
        FULLPATHS - if True, convert each file name to an absolute path
        """
        with open(FN, "r") as F:
            line = F.readline()  # assumes all data is on a single line
        for entry in line.split('|'):
            fname = entry.strip()
            if not fname:
                continue
            if FULLPATHS:
                fname = os.path.abspath(fname)
            self.SREADFILES.append(fname)
        if DEBUG:
            print(str(self.SREADFILES))


def _pairs_only(READPAIRS):
    """Return only those entries of READPAIRS that hold a left and a right file."""
    return [P for P in READPAIRS if len(P) > 1]


def WriteSpadesCom(TF, PF, SF):
    """
    Print the read-file options to be included in the Spades command line.

    TF - TSVFiles; one-element entries become -s, two-element entries -1/-2
    PF - PairedFiles; each pair becomes -1/-2
    SF - SingleReadFiles; each file becomes -s
    """
    parts = []
    for P in TF.READPAIRS:
        if len(P) == 1:
            parts.append(' -s ' + P[0] + ' ')
        else:
            parts.append(' -1 ' + P[0] + ' -2 ' + P[1])
    # The Spades documentation says the --pe<N>-1 / --pe<N>-2 syntax should
    # work for multiple libraries, but it doesn't, so plain -1/-2 is used.
    for P in PF.READPAIRS:
        parts.append(' -1 ' + P[0] + ' -2 ' + P[1])
    for fname in SF.SREADFILES:
        parts.append(' -s ' + fname + ' ')
    print(''.join(parts))


def _abyss_phrase(ITEMS, TAG):
    """
    Build the part of the Abyss command for one tag (eg. lib, pe, mp, long, se).

    For TAG 'lib' and two pairs the result looks like
        lib='lib1 lib2 ' lib1='exp1_L.fq exp1_R.fq' lib2='exp2_L.fq exp2_R.fq'
    For tags 'long' and 'se' each item is a single file name; for all other
    tags each item is a [left, right] pair. Returns "" when ITEMS is empty.
    """
    if not ITEMS:
        return ""
    names = []      # library names listed after TAG=, eg. lib1 lib2 ...
    readlist = []   # one  name='files'  definition per library
    for N, item in enumerate(ITEMS, 1):
        name = TAG + str(N)
        names.append(name + ' ')
        if TAG in ('long', 'se'):
            files = item + ' '
        else:
            files = item[0] + ' ' + item[1]
        readlist.append(name + "='" + files + "' ")
    return TAG + "='" + ''.join(names) + "' " + ''.join(readlist)


def WriteAbyssCom(TF, PF, MP, LF, SF, OFN):
    """
    Write the read-file parameters to be included in the Abyss command line.

    TF  - TSVFiles; two-file entries are written as 'lib' libraries.
          Single-file entries cannot form a paired library, so they are
          written with the single-end reads ('se') instead of raising
          IndexError.
    PF  - PairedFiles, written as 'pe' libraries
    MP  - PairedFiles, written as 'mp' libraries
    LF  - SingleReadFiles, written as 'long' libraries
    SF  - SingleReadFiles, written as 'se' libraries
    OFN - output file name; if "", the result is printed to standard output,
          otherwise it is written to OFN (without a trailing newline)
    """
    tsv_single = [P[0] for P in TF.READPAIRS if len(P) == 1]

    COMSTR = _abyss_phrase(_pairs_only(TF.READPAIRS), 'lib')
    COMSTR += _abyss_phrase(PF.READPAIRS, 'pe')
    COMSTR += _abyss_phrase(MP.READPAIRS, 'mp')
    COMSTR += _abyss_phrase(LF.SREADFILES, 'long')
    COMSTR += _abyss_phrase(tsv_single + list(SF.SREADFILES), 'se')

    if OFN == "":
        print(COMSTR)
    else:
        with open(OFN, 'w') as OUTFILE:
            OUTFILE.write(COMSTR)


def WriteTransrateCom(TF, PF):
    """
    Print the read-file options to be included in the Transrate command line,
    in the form  --left=l1,l2,... --right=r1,r2,...

    Transrate ONLY works with paired-end reads, as of v1.03, so entries in TF
    that name a single file are skipped (they previously raised IndexError).

    TF - TSVFiles
    PF - PairedFiles
    """
    pairs = _pairs_only(TF.READPAIRS) + _pairs_only(PF.READPAIRS)
    LEFTREADS = ','.join([P[0] for P in pairs])
    RIGHTREADS = ','.join([P[1] for P in pairs])
    print('--left=' + LEFTREADS + ' --right=' + RIGHTREADS)


def WritePolluxCom(TF, PF, SF):
    """
    Print the read-file options to be included in the Pollux command line.
    One ' -i ...' line is printed for each single file or pair of files.

    TF - TSVFiles
    PF - PairedFiles
    SF - SingleReadFiles
    """
    for P in TF.READPAIRS:
        if len(P) == 1:
            print(' -i ' + P[0])
        else:
            print(' -i ' + P[0] + ' ' + P[1])
    for P in PF.READPAIRS:
        print(' -i ' + P[0] + ' ' + P[1])
    for fname in SF.SREADFILES:
        print(' -i ' + fname)


def WriteTrinityCom(TF, OUTTYPE):
    """
    Print the read-file options to be included in the Trinity command line.

    TF      - TSVFiles
    OUTTYPE - 'trinityp' writes --left/--right using only the two-file
              entries of TF; any other value writes --single using only the
              one-file entries. An empty string is printed if TF holds no
              entries at all.
    """
    COMSTR = ""
    if TF.READPAIRS:
        if OUTTYPE == 'trinityp':  # paired reads
            pairs = [P for P in TF.READPAIRS if len(P) == 2]
            LEFT = ','.join([P[0] for P in pairs])
            RIGHT = ','.join([P[1] for P in pairs])
            COMSTR = '--left ' + LEFT + ' --right ' + RIGHT
        else:  # single reads
            SINGLE = ','.join([P[0] for P in TF.READPAIRS if len(P) == 1])
            COMSTR = '--single ' + SINGLE
    print(COMSTR)


# ======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode. Reads the command line, loads the
    lists of read files, and writes them in the format chosen by --outtype.
    """
    # Read parameters from command line
    P = Parameters()

    # Read paired-end files used for sequence assembly
    TF = TSVFiles()
    if P.TSVFILE != "":
        TF.ReadTSVfile(P.TSVFILE, P.FULLPATHS)

    # Read additional read files used for scaffolding
    PF = PairedFiles()
    if P.PEFILE != "":
        PF.ReadPEfile(P.PEFILE, P.FULLPATHS)

    MF = PairedFiles()
    if P.MPFILE != "":
        MF.ReadPEfile(P.MPFILE, P.FULLPATHS)

    LF = SingleReadFiles()
    if P.LONGFILE != "":
        LF.ReadSfile(P.LONGFILE, P.FULLPATHS)

    SF = SingleReadFiles()
    if P.SEFILE != "":
        SF.ReadSfile(P.SEFILE, P.FULLPATHS)

    # Write command line output in the specified format.
    if P.OUTTYPE == 'spadescom':
        WriteSpadesCom(TF, PF, SF)
    elif P.OUTTYPE == 'abysscom':
        WriteAbyssCom(TF, PF, MF, LF, SF, P.OUTFILE)
    elif P.OUTTYPE == 'transratecom':
        WriteTransrateCom(TF, PF)
    elif P.OUTTYPE == 'polluxcom':
        WritePolluxCom(TF, PF, SF)
    elif P.OUTTYPE in ['trinityp', 'trinitys']:
        WriteTrinityCom(TF, P.OUTTYPE)
    else:
        # Nothing is written to standard output for an unknown type (as
        # before), but say so on stderr instead of failing silently.
        sys.stderr.write(PROGRAM + "unrecognized --outtype '" + P.OUTTYPE
                         + "'; no output written\n")


if __name__ == "__main__":
    main()