#!/usr/bin/env python """ optparse is deprecated in favor of argparse as of Python 2.7. However, since 2.7 is not always present on many systems, at this writing, it is safer to stick with optparse for now. It should be easy to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser """ from optparse import OptionParser import datetime import getpass import os import re import shutil import stat import subprocess import sys import time ''' bl_soapdenovo2.py - Run SOAPdenovo2 Synopsis: bl_soapdenovo2.py infile outdir prefix program kmer genomesize [SOAPdenov2 options] @modified: May 6, 2019 @author: Brian Fristensky @contact: Brian.Fristensky@umanitoba.ca ''' #blib = os.environ.get("BIRCHPYLIB") #sys.path.append(blib) #from birchlib import Birchmod PROGRAM = "bl_soapdenovo2.py : " USAGE = "\n\tUSAGE: bl_soapdenovo2.py infile outdir prefix program kmer genomesize [SOAPdenov2 options]" DEBUG = True if DEBUG : print('bl_soapdenovo2: Debugging mode on') #BM = Birchmod(PROGRAM, USAGE) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class Parameters: """ Wrapper class for command line parameters """ def __init__(self): """ Initializes arguments: INFILE = "" OUTDIR = "" PREFIX = "" PROGRAM = "SOAPdenovo-63mer" KMER = 23 GENOME = 20 SOAPARGS = [] Then calls read_args() to fill in their values from command line """ self.INFILE = "" self.OUTDIR = "" self.PREFIX = "" self.PROGRAM = "SOAPdenovo-63mer" self.KMER = 23 self.GENOME = 20 self.MAXREADLEN = 100 self.SOAPARGS = [] self.read_args() if DEBUG : print('------------ Parameters from command line ------') print(' INFILE: ' + self.INFILE) print(' OUTDIR: ' + self.OUTDIR) print(' PREFIX: ' + self.PREFIX) print(' PROGRAM: ' + self.PROGRAM) print(' KMER: ' + str(self.KMER)) print(' GENOME: ' + str(self.GENOME)) print(' SOAPARGS: ' + str(self.SOAPARGS)) print() def read_args(self): """ Read command line arguments into a Parameter object """ self.INFILE = sys.argv[1] self.OUTDIR = sys.argv[2] self.PREFIX = sys.argv[3] self.PROGRAM = sys.argv[4] self.KMER = int(sys.argv[5]) self.GENOME = int(sys.argv[6]) * 1000000 self.SOAPARGS = sys.argv[7:] # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class ConfigLib : """ Library for a config fiile as described in SOAPdenovo2 documentation. """ def __init__(self): """ Initializes arguments: READPAIRS = [] PARAMS = [] """ self.READPAIRS = [] self.PARAMS = [] def ParseLib(self,LINE) : """ BioLegato writes all information for each lib on a single LIBLINE, which must be parsed. Each line is structured accordingly: ::= ^ ::= ,[|,] #ie. one or more comma-separated read pairs ::=

[|

]

::= = The pipe character is used as a separator between read pairs and between parameters. For example, Paired-end reads: leftreadfile.fq.gz,rightreadfile.fq.gz Single-end reads: reads1.fq.gz """ # Ugly workround for BioLegato bug that prevents recursive evaluation of variables # when there are several tabs containing multiple panels # In this case variables witn no value will get written as %%, so # we get rid of anything that matches that pattern. # When BioLegato is fixed, these lines can be deleted. pattern = re.compile('%\S+%') CleanLine = re.sub(pattern,'',LINE) # Get rid of quotes that might enclose the string, then split it into Filename and Parameter # parts for separate parsing. LTOKENS = CleanLine.replace('"','').split('^') FTOKENS = LTOKENS[0] PTOKENS = LTOKENS[1] # Parse Read Pairs PAIRS = FTOKENS.split('|') for P in PAIRS : tokens = P.split(',') # ignore blank fields. Add either single or pair of filenames # to list. Only process names from first two fields on a line # and ignore other fields. if len(tokens) > 0 : r1 = tokens[0].strip() if len(r1) > 0 : fnames = [r1] else : fnames = [] if len(tokens) > 1 : r2 = tokens[1].strip() if len(r2) > 0 : fnames.append(r2) if len(fnames) > 0 : self.READPAIRS.append(fnames) # Parse Parameters PARAMS = PTOKENS.split('|') for P in PARAMS : testP = P.strip() if len(testP) > 0 : self.PARAMS.append(testP) if DEBUG : print(str(self.READPAIRS)) print(str(self.PARAMS)) # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Create config file for running SOAPdenovo2 def BuildConfig(P,Libs,CFN) : # Return a code indicating the type of file(s), based on file exension(s) def FileType(PR) : FT = "" Fastq = ['.fq','.fastq'] # There are too many file extensions that have been used for fasta files, # so we assume that it it's not a fastq file, it must be a fasta file. if len(PR) == 2 : Ext1 = os.path.splitext(PR[0])[1].lower() Ext2 = os.path.splitext(PR[1])[1].lower() if Ext1 in Fastq and Ext2 in Fastq : FT = 'qn' else : FT = 'fn' else : Ext1 = os.path.splitext(PR[0])[1].lower() if Ext1 in Fastq : FT = 'q' else : FT = 'f' return FT NL = '\n' CFILE = open(CFN,'w') CFILE.write('#===== SOAPdenovo2 Config file generated by bl_soapdenovo2.py =====' + NL) CFILE.write(NL) CFILE.write('# - - - - - GLOBAL PARAMETERS - - - - -' + NL) CFILE.write('max_rd_len=' + str(P.MAXREADLEN) + NL) CFILE.write(NL) for L in Libs : CFILE.write('[LIB]' + NL) # Write parameters CFILE.write('# - - - - - PARAMETERS - - - - -' + NL) for P in L.PARAMS : CFILE.write(P + NL) # Write read files CFILE.write('# - - - - - READ FILES - - - - -' + NL) for PR in L.READPAIRS : FT = FileType(PR) if FT == 'qn' : # paired fastq files CFILE.write('q1=' + PR[0] + NL) CFILE.write('q2=' + PR[1] + NL) elif FT == 'fn' : # paired fasta files CFILE.write('f1=' + PR[0] + NL) CFILE.write('f2=' + PR[1] + NL) elif FT == 'q' : # single fastq file CFILE.write('q=' + PR[0] + NL) elif FT == 'f' : # single fasta file CFILE.write('f=' + PR[0] + NL) CFILE.write(NL) CFILE.write(NL) CFILE.close() # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def RunSOAPdenovo2(P,CFN,LOGFILE) : # k-mer must be an odd number (default 23) # for SOAPdenovo-63mer, 13 >= k-mer <= 63 # for SOAPdenovo-127mer, 13 >= k-mer <= 127 KMER = P.KMER if KMER % 2 == 0 : # modulo operator returns 0 if even KMER += 1 # force even number to next highest odd number if P.PROGRAM == "SOAPdenovo-63mer" : if KMER > 63 : KMER = 63 else : if KMER > 127 : KMER = 127 COMMAND=[P.PROGRAM] COMMAND.extend(["all", "-s", CFN, "-o", P.PREFIX, "-K", str(KMER), "-N", str(P.GENOME)]) COMMAND.extend(P.SOAPARGS) # Run the command - - - - - - - - - - - - - - - - - LOGFILE.write('======== SOAPdenovo2 ==========' + '\n') LOGFILE.write(str(COMMAND) + '\n') StartTime = datetime.datetime.now() LOGFILE.write('Start time: ' + str(StartTime) + '\n') LOGFILE.write('\n') LOGFILE.flush() p = subprocess.Popen(COMMAND,stdout=LOGFILE, stderr=LOGFILE) p.wait() FinishTime = datetime.datetime.now() LOGFILE.write('\n') LOGFILE.write('Finish time: ' + str(FinishTime) + '\n') ElapsedTime = FinishTime - StartTime LOGFILE.write('Elapsed time: ' + str(ElapsedTime) + '\n') #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ # Read parameters from command line P = Parameters() # Read libraries from input file Libs = [] if not P.INFILE == "" : F = open(P.INFILE,"r") for line in F.readlines() : line = line.strip() if line != "" : NewLib = ConfigLib() NewLib.ParseLib(line) Libs.append(NewLib) OKAY = True # We'll be working in the output directory, so we need # absolute file paths to the input files. # While we're at it, we need to find out which read is the longest, # for setting max_rd_len, the only global varialble in config file. # The latter is important, because if we don't set max_rd_len, # SOAPdenovo2 will truncate reads in ALL libraries to 100 ! for L in Libs : for PR in L.READPAIRS : PR[0] = os.path.abspath(PR[0]) if len(PR) == 2 : PR[1] = os.path.abspath(PR[1]) for param in L.PARAMS : if param.startswith('rd_len_cutoff') : rd_len_cutoff = int(param.split('=')[1]) if rd_len_cutoff > P.MAXREADLEN : P.MAXREADLEN = rd_len_cutoff else : OKAY = False F.close() # From now on, everything happens in the output directory. if not os.path.exists(P.OUTDIR) : os.mkdir(P.OUTDIR) os.chdir(P.OUTDIR) LOGFN = os.path.join(P.OUTDIR, 'soapdenovo2.log') # First create the file, then re-open it so that we can append to it. # We first create with 'w' just to make sure that we're creating a fresh copy, # rather than appending to an existing copy from a previous run. LOGFILE = open(LOGFN,'w') LOGFILE.close() LOGFILE = open(LOGFN,'a') LOGFILE.write('\n') if not OKAY : LOGFILE.write('bl_soapdenovo2.py: >>> Need to specify a list of sequencing read files <<<' + '\n') # Create config file for running SOAPdenovo2 CFN = P.PREFIX + '.config' BuildConfig(P,Libs,CFN) # Run SOAPdenovo2 if OKAY : RunSOAPdenovo2(P,CFN,LOGFILE) LOGFILE.close() if __name__ == "__main__": main() #else: #used to generate documentation # import doctest # doctest.testmod() #if (BM.documentor() or "-test" in sys.argv): # pass #else: # main()