#!/usr/bin/env python
'''
bl_seqkit_sample.py - given files with names of readfiles, create files
containing a randomly chosen sample of reads

Synopsis: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer

@modified: January 15, 2019
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca
'''

# optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing, it is
# safer to stick with optparse for now. It should be easy to change later,
# since the syntax is very similar between argparse and optparse.
from optparse import OptionParser

import os
import random
import re
import subprocess
import sys

PROGRAM = "bl_seqkit_sample.py : "
USAGE = "\n\tUSAGE: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer"

DEBUG = True  # Must be false when run by BioLegato
if DEBUG:
    print('bl_seqkit_sample: Debugging mode on')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.

    Attributes set from the command line:
        TSVFILE - name of the TAB-separated file listing the read files
        EXT     - file extension of the input files
        PREFIX  - string inserted between the base name and the extension
        PERCENT - integer percentage of each file to sample (default 5)
    """

    def __init__(self):
        """
        Set the default values and then overwrite them with whatever was
        given on the command line.
        """
        self.TSVFILE = ""
        self.EXT = ""
        self.PREFIX = ""
        self.PERCENT = 5
        self.read_args()

        if DEBUG:
            print('------------ Parameters from command line ------')
            print('    TSVFILE: ' + self.TSVFILE)
            print('    EXT: ' + self.EXT)
            print('    PREFIX: ' + self.PREFIX)
            print('    PERCENT: ' + str(self.PERCENT))
            print('')

    def read_args(self):
        """
        Read command line arguments into this Parameters object.
        """
        parser = OptionParser()
        parser.add_option("--tsv", dest="tsvfile", action="store", default="",
                          help="TAB-separated value file of file names")
        parser.add_option("--ext", dest="ext", action="store", default="",
                          help="file extension of input files")
        parser.add_option("--prefix", dest="prefix", action="store", default="",
                          help="prefix to add to file extension")
        # type="int" lets optparse reject a non-numeric value with a normal
        # usage error instead of an uncaught ValueError traceback.
        parser.add_option("--percent", dest="percent", action="store",
                          type="int", default=5,
                          help="percent of original file to include in the sample")

        (options, args) = parser.parse_args()
        self.TSVFILE = options.tsvfile
        self.EXT = options.ext
        self.PREFIX = options.prefix
        self.PERCENT = int(options.percent)


class TSVFiles:
    """
    Methods for reading a list of read file names from a TSV file.
    """

    def __init__(self):
        """
        Initializes attributes:
            FILENAMES = []
        """
        self.FILENAMES = []

    def ReadTSVfile(self, FN):
        """
        Append the file name found in the first field of each line of FN
        to self.FILENAMES. Blank lines and lines beginning with '#' are
        skipped. Each file is on a separate line, eg.

        reads1.fq.gz
        reads2.fq.gz
        reads3.fq.gz
        """
        TAB = '\t'
        # The context manager closes the file even if reading fails.
        with open(FN, "r") as F:
            for line in F:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Get rid of double quotes that enclose fields when some
                # programs write output, and then split by TABs. Only the
                # first field on a line is used; other fields are ignored.
                tokens = line.replace('"', '').split(TAB)
                fname = tokens[0].strip()
                # Ignore a blank first field, eg. one that was just "".
                if fname:
                    self.FILENAMES.append(fname)
        if DEBUG:
            print(str(self.FILENAMES))


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def SampleFileName(FN, EXT, PREFIX):
    """
    Return the name of the sample file for input file FN, built as
    <basename> + PREFIX + EXT, or None if FN does not end with EXT.

    The extension is removed from the END of the name, so a name that
    contains the extension text more than once (or an empty EXT) still
    gives the expected result.
    """
    if not FN.endswith(EXT):
        return None
    BaseName = FN[:len(FN) - len(EXT)]
    return BaseName + PREFIX + EXT


def CreateSampleFile(FN, EXT, PREFIX, PERCENT):
    """
    Run 'seqkit sample' on FN, writing PERCENT percent of the reads to a
    file named <basename> + PREFIX + EXT. Files whose names do not end
    with EXT are silently skipped. Returns None.
    """
    OutName = SampleFileName(FN, EXT, PREFIX)
    if OutName is None:
        return

    # With an empty PREFIX the output name equals the input name, and
    # seqkit would then be writing over the file it is reading.
    if OutName == FN:
        sys.stderr.write(PROGRAM + 'output name is the same as input name '
                         + FN + ' ; skipping\n')
        return

    # Generate parameters
    Proportion = PERCENT / 100.0
    Seed = random.randint(-32767, 32767)

    # Run seqkit sample to generate the sample file
    COMSTR = ['seqkit', 'sample', '--proportion', str(Proportion),
              '--rand-seed', str(Seed), '-o', OutName, FN]
    ReturnCode = subprocess.call(COMSTR)
    if ReturnCode != 0:
        sys.stderr.write(PROGRAM + 'seqkit exited with status '
                         + str(ReturnCode) + ' for ' + FN + '\n')


#======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.
    """
    # Read parameters from command line
    P = Parameters()

    # Read the list of read files to be sampled
    TF = TSVFiles()
    if not P.TSVFILE == "":
        TF.ReadTSVfile(P.TSVFILE)

    # For each file, write a sample file with the specified percentage of the
    # original file.
    for F in TF.FILENAMES:
        CreateSampleFile(F, P.EXT, P.PREFIX, P.PERCENT)


if __name__ == "__main__":
    main()