#!/usr/bin/env python3 ''' bl_seqkit_sample.py - given files with names of readfiles, create files containing a randomly chosen sample of reads Synopsis: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer Description: Runs seqkit sample to create files containing randomly chosensubsets of a large read set. The names or paired-end read files are given as two tab-separated fields on tsvfile, while only one field is given for files with single-end reads. To make sure that both left and right reads for the same fragment are written to the output, seqkit sample is run using the same random seed for both read files. @modified: April 5, 2021 @author: Brian Fristensky @contact: Brian.Fristensky@umanitoba.ca ''' """ optparse is deprecated in favor of argparse as of Python 2.7. However, since 2.7 is not always present on many systems, at this writing, it is safer to stick with optparse for now. It should be easy to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser """ from optparse import OptionParser import os import random import re import subprocess import sys PROGRAM = "bl_seqkit_sample.py : " USAGE = "\n\tUSAGE: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer --threads integer" DEBUG = False #Must be False when run by BioLegato if DEBUG : print('bl_seqkit_sample: Debugging mode on') # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class Parameters: """ Wrapper class for command line parameters """ def __init__(self): """ Initializes arguments: TSVFILE = "" EXT = "" PREFIX = "" PERCENT = "" """ self.TSVFILE = "" self.EXT = "" self.PREFIX = "" self.PERCENT = 5 self.THREADS = 1 self.read_args() if DEBUG : print('------------ Parameters from command line ------') print(' TSVFILE: ' + self.TSVFILE) print(' EXT: ' + self.EXT) print(' PREFIX: ' + self.PREFIX) print(' PERCENT: ' + str(self.PERCENT)) print(' THREADS: ' + str(self.THREADS)) print('') def read_args(self): """ Read command line arguments into a Parameter object """ parser = OptionParser() parser.add_option("--tsv", dest="tsvfile", action="store", default="", help="TAB-separated value file of file names") parser.add_option("--ext", dest="ext", action="store", default="", help="file extension of input files") parser.add_option("--prefix", dest="prefix", action="store", default="", help="prefix to add to file extension") parser.add_option("--percent", dest="percent", action="store", default=5, help="percent of original file to include in the sample") parser.add_option("--threads", "-j", dest="threads", action="store", default=1, help="# CPUs to use") (options, args) = parser.parse_args() self.TSVFILE = options.tsvfile self.EXT = options.ext self.PREFIX = options.prefix self.PERCENT = int(options.percent) self.THREADS = int(options.threads) class TSVFiles : """ Methods for reading lists of paired read TSV files, and for writing lists to output. """ def __init__(self): """ Initializes arguments: READPAIRS = [] """ self.READPAIRS = [] def ReadTSVfile(self,FN) : """ TSV file containing names of paired-end and/or single end read files. Paired-end files are on lines such as leftreadfile.fq.gzrightreadfile.fq.gz Single-end files have a each file on a separate line reads1.fq.gz reads2.fq.gz reads3.fq.gz """ TAB = '\t' F = open(FN,"r") for line in F.readlines() : line = line.strip() if len(line) > 0 and not line.startswith('#') : # get rid of double quotes that enclose fields when some programs write # output, and then split by TABs. tokens = line.replace('"','').split(TAB) # ignore blank fields. Add either single or pair of filenames # to list. Only process names from first two fields on a line # and ignore other fields. if len(tokens) > 0 : r1 = tokens[0].strip() if len(r1) > 0 : fnames = [r1] else : fnames = [] if len(tokens) > 1 : r2 = tokens[1].strip() if len(r2) > 0 : fnames.append(r2) if len(fnames) > 0 : self.READPAIRS.append(fnames) if DEBUG : print(str(self.READPAIRS)) F.close() # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - def CreateSampleFiles(FNAMES,EXT,PREFIX,PERCENT,THREADS) : def WriteSampleFile(FN,OutName) : print(" running WriteSampleFile") # Run seqkit sample to generate the sample file COMSTR = ['seqkit','sample','--proportion',Proportion,'--rand-seed', Seed, '--threads', str(THREADS), '-o', OutName, FN] print(str(COMSTR)) p = subprocess.Popen(COMSTR) p.wait() # . . . . . . . . . . . . . . print(" running CreateSampleFiles") # Generate parameters Proportion = str(PERCENT/100.0) Seed = str(random.randint(-32767,32767)) # single-end file if len(FNAMES) == 1 : FN1 = FNAMES[0] if FN1.endswith(EXT) : # Parse the filename into basename and extension BaseName1 = FN1[0:FN1.find(EXT)] # Create an output filename consisting of OutName1 = BaseName1 + PREFIX + EXT # Run seqkit WriteSampleFile(FN1,OutName1) # paired-end files else : FN1 = FNAMES[0] FN2 = FNAMES[1] if ( FN1.endswith(EXT) and FN2.endswith(EXT) ) : # Parse the filename into basename and extension BaseName1 = FN1[0:FN1.find(EXT)] BaseName2 = FN2[0:FN2.find(EXT)] # Create an output filename consisting of OutName1 = BaseName1 + PREFIX + EXT OutName2 = BaseName2 + PREFIX + EXT # Run seqkit WriteSampleFile(FN1,OutName1) WriteSampleFile(FN2,OutName2) #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ print("running bl_seqkit_sample.py") # Read parameters from command line P = Parameters() # Read paired-end files used for sequence assembly TF = TSVFiles() if not P.TSVFILE == "" : TF.ReadTSVfile(P.TSVFILE) # For each pair of names, run seqkit sample to generate sample files. for FNAMES in TF.READPAIRS : print("FNAMES: " + str(FNAMES)) CreateSampleFiles(FNAMES,P.EXT,P.PREFIX,P.PERCENT,P.THREADS) if __name__ == "__main__": main()