#!/usr/bin/env python3 ''' bl_hisat2.py - Given a series of paired-end read files, run hisat2 for each pair of files, and generate sorted BAM files for each pair. Synopsis: bl_hisat2.py tsvfile threads ifile outdir [hisat2 options] For each pair of read files (ie. left and right reads) in tsvfile, create a name for the BAM file to be generated from each pair of read files. The name of the BAM file is the longest leading string in common between the two filenames, followed by the .sam extension. Thus, control1_R1.fastq.gzcontrol1_R2.fastq.gz would output to control1_R.bam tsvfile - a tab-separated value file with each pair of filenames on separate lines MUST be the first argument. All hisat2 arguments follow. ifile - a genome index file produced by hisat2-build with an extension in the form .X.ht2, where X X is a digit from 1-9. This will be used to create the hisat2 option -x as described in the hisat2 documentation. outdir - director for writing .bam files The workflow runs hisat2 first to generate temporary .sam files, and then runs samtools sort to sort the files, which is required by stringtie or cufflinks. [hisat2 options] - options to be passed to hisat2 @modified: June 1, 2021 @author: Brian Fristensky @contact: Brian.Fristensky@umanitoba.ca ''' """ optparse is deprecated in favor of argparse as of Python 2.7. However, since 2.7 is not always present on many systems, at this writing, it is safer to stick with optparse for now. It should be easy to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser """ from optparse import OptionParser import os import re import stat import subprocess import sys PROGRAM = "bl_hisat2.py : " USAGE = "\n\tUSAGE: bl_hisat2.py tsvfile threads ifile outdir [hisat2 options] " DEBUG = False if DEBUG : print('bl_hisat2: Debugging mode on') # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class Parameters: """ Wrapper class for command line parameters """ def __init__(self): """ Initializes arguments: TSVFILE = "" THREADS = "2" IFILE = "" OUTDIR = "bamfiles" hisat2args = [] Then calls read_args() to fill in their values from command line """ self.TSVFILE = "" self.THREADS = "2" self.IFILE = "" self.OUTDIR = "bamfiles" self.hisat2args = [] self.read_args() if DEBUG : print('------------ Parameters from command line ------') print(' TSVFILE: ' + self.TSVFILE) print(' THREADS: ' + str(self.THREADS)) print(' IFILE: ' + self.IFILE) print(' OUTDIR: ' + self.OUTDIR) print(' hisat2args: ' + str(self.hisat2args)) print() def read_args(self): """ Read command line arguments into a Parameter object """ self.TSVFILE = sys.argv[1] self.THREADS = sys.argv[2] self.IFILE = sys.argv[3] self.OUTDIR = sys.argv[4] self.hisat2args = sys.argv[5:] # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class TSVFiles : """ Methods for reading lists of paired read TSV files, and for writing lists to output. """ def __init__(self): """ Initializes arguments: READPAIRS = [] """ self.READPAIRS = [] def ReadTSVfile(self,FN) : """ TSV file containing names of paired-end and/or single end read files. Paired-end files are on lines such as leftreadfile.fq.gzrightreadfile.fq.gz Single-end files have a each file on a separate line reads1.fq.gz reads2.fq.gz reads3.fq.gz """ TAB = '\t' F = open(FN,"r") for line in F.readlines() : line = line.strip() if len(line) > 0 and not line.startswith('#') : # get rid of double quotes that enclose fields when some programs write # output, and then split by TABs. tokens = line.replace('"','').split(TAB) # ignore blank fields. Add either single or pair of filenames # to list. Only process names from first two fields on a line # and ignore other fields. if len(tokens) > 0 : r1 = tokens[0].strip() if len(r1) > 0 : fnames = [r1] else : fnames = [] if len(tokens) > 1 : r2 = tokens[1].strip() if len(r2) > 0 : fnames.append(r2) if len(fnames) > 0 : self.READPAIRS.append(fnames) if DEBUG : print(str(self.READPAIRS)) F.close() # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def RunHisat2(PR,THREADS,IFILE,OUTDIR,HISAT2ARGS,LOGFILE) : # Create a basename for the hisat2 -x option by # removing the .X.ht2 file extension from IFILE # Assumes the basename does not contain a dot (.) def TruncPath(IFILE) : TP = IFILE.split('.')[0] return TP # Return the longest identical substring in common # between s1 and s2, reading from left to right. def FirstDiff(s1,s2) : I = 0 MINLEN=min(len(s1),len(s2)) DONE = False while s1[I] == s2[I] and I < MINLEN : I +=1 return s1[:I] # Construct the command string - - - - - - - - - - - - - - # Dummy command for testing #COMSTR=["hisat2","--help"] COMSTR=["hisat2"] # Add the -x option, specifying the basename for the index files. XPATH = TruncPath(IFILE) COMSTR.extend(['-x',XPATH]) # Add the names of the left and right read files if len(PR) > 0 : if len(PR) == 1 : COMSTR.extend(['-s', PR[0], ' ']) else: COMSTR.extend(['-1', PR[0], '-2', PR[1]]) # Find a common basename for the SAM file, and add the .sam filename # to the command ComName = OUTDIR + os.sep + os.path.basename(FirstDiff(PR[0],PR[1])) if DEBUG : print('ComName: ' + ComName) TmpSamName= '_tmp' + '.sam' # temporary file with unsorted output from hisat2 COMSTR.extend(['-S',TmpSamName]) # Append the hisat2 options to the command COMSTR.extend(['-p', THREADS]) COMSTR.extend(HISAT2ARGS) if DEBUG : print('COMSTR: ' + str(COMSTR)) # Run Hisat2 - - - - - - - - - - - - - - - - - LOGFILE.write('======== hisat2 ==========' + '\n') LOGFILE.write(str(COMSTR) + '\n') LOGFILE.write('\n') LOGFILE.flush() p = subprocess.Popen(COMSTR,stdout=LOGFILE,stderr=LOGFILE) p.wait() LOGFILE.write('\n') # Run samtools sort to sort the bam file - - - - - - - - - - - - - - - - - BamName= ComName + '.bam' # softed output for use with cufflinks COMSTR=['samtools','sort','--threads', THREADS] COMSTR.extend(['-o', BamName, TmpSamName]) LOGFILE.write('======== samtools sort ==========' + '\n') LOGFILE.write(str(COMSTR) + '\n') LOGFILE.write('\n') LOGFILE.flush() p = subprocess.Popen(COMSTR,stdout=LOGFILE,stderr=LOGFILE) p.wait() # Run samtools stats to create statistics for the bam file - - - - - - - - - - - - CLIST=['samtools','stats','--threads', THREADS, BamName] StatName = BamName + ".stats" CLIST.extend(['>', StatName]) LOGFILE.write('======== samtools stats ==========' + '\n') LOGFILE.write(str(CLIST) + '\n') LOGFILE.write('\n') LOGFILE.flush() COMMAND = " ".join(CLIST) print(COMMAND) os.system(COMMAND) # Delete temporary sam file os.remove(TmpSamName) LOGFILE.write('\n') #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ # Read parameters from command line P = Parameters() TF = TSVFiles() if not P.TSVFILE == "" : TF.ReadTSVfile(P.TSVFILE) # Make sure that the output directory exists if not os.path.isdir(P.OUTDIR) : os.mkdir(P.OUTDIR) LOGFN = os.path.join(P.OUTDIR,'bl_hisat2.log') LOGFILE = open(LOGFN,'w') LOGFILE.write('\n') # Run hisat2 for PR in TF.READPAIRS : RunHisat2(PR,P.THREADS,P.IFILE,P.OUTDIR,P.hisat2args,LOGFILE) LOGFILE.close() if __name__ == "__main__": main() #else: #used to generate documentation # import doctest # doctest.testmod() #if (BM.documentor() or "-test" in sys.argv): # pass #else: # main()