#!/usr/bin/env python3
'''
bl_seqkit_fq2fa.py - given files with names of readfiles, create fasta files
using a specified file extension.

Synopsis: bl_seqkit_fq2fa.py --tsv tsvfile [--ext file_extension]

Description: Runs seqkit fq2fa to create fasta files.
The names of paired-end read files are given as two tab-separated fields on
each line of tsvfile, while only one field is given for files with
single-end reads. Every read file named in tsvfile is converted on its own;
the output file name is the input name with .fq or .fastq replaced by the
chosen extension (a trailing .gz is preserved).

@modified: April 5, 2021
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca
'''

# optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing, it is
# safer to stick with optparse for now. It should be easy to change later,
# since the syntax is very similar between argparse and optparse.

from optparse import OptionParser

import os
import subprocess
import sys

PROGRAM = "bl_seqkit_fq2fa.py : "
USAGE = "\n\tUSAGE: bl_seqkit_fq2fa.py --tsv tsvfile --ext file_extension"

DEBUG = True
if DEBUG:
    print('bl_seqkit_fq2fa: Debugging mode on')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.

    Attributes:
        TSVFILE: name of the TAB-separated file listing read files
                 ("" when --tsv was not given).
        EXT:     extension for the fasta files to be written, stored
                 without a leading "." (default "fsn").
    """

    def __init__(self):
        """
        Set the default values, then overwrite them from the command line.
        """
        self.TSVFILE = ""
        self.EXT = "fsn"
        self.read_args()

        if DEBUG:
            print('------------ Parameters from command line ------')
            print(' TSVFILE: ' + self.TSVFILE)
            print(' EXT: ' + self.EXT)
            print('')

    def read_args(self):
        """
        Read command line arguments (sys.argv) into this Parameters object.
        """
        parser = OptionParser()
        parser.add_option("--tsv", dest="tsvfile", action="store", default="",
                          help="TAB-separated value file of file names")
        # The extension is applied to the OUTPUT names (see CreateFastaFiles).
        parser.add_option("--ext", dest="ext", action="store", default="fsn",
                          help="file extension of output fasta files")

        (options, args) = parser.parse_args()
        self.TSVFILE = options.tsvfile
        self.EXT = options.ext
        # Accept both "--ext fsn" and "--ext .fsn".
        if self.EXT.startswith("."):
            self.EXT = self.EXT[1:]


class TSVFiles:
    """
    Methods for reading lists of read file names from a TSV file.

    Attributes:
        READPAIRS: list of lists; each inner list holds the one
                   (single-end) or two (paired-end) file names found on
                   one line of the TSV file.
    """

    def __init__(self):
        """
        Start with an empty list of read files.
        """
        self.READPAIRS = []

    def ReadTSVfile(self, FN):
        """
        Read the TSV file FN containing names of paired-end and/or
        single-end read files, appending one entry per line to READPAIRS.

        Paired-end files are on lines such as

            leftreadfile.fq.gz<TAB>rightreadfile.fq.gz

        Single-end files have each file on a separate line

            reads1.fq.gz
            reads2.fq.gz
            reads3.fq.gz

        Blank lines and lines beginning with '#' are skipped. Only the
        first two fields of a line are used; blank fields are ignored.
        """
        TAB = '\t'
        # 'with' guarantees the file is closed even if parsing fails.
        with open(FN, "r") as F:
            for line in F:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Get rid of double quotes that enclose fields when some
                # programs write output, and then split by TABs.
                tokens = line.replace('"', '').split(TAB)
                # Keep the non-blank names among the first two fields.
                fnames = [t.strip() for t in tokens[:2] if t.strip()]
                if fnames:
                    self.READPAIRS.append(fnames)
        if DEBUG:
            print(str(self.READPAIRS))


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def CreateFastaFiles(FNAMES, EXT):
    """
    Convert each fastq file named in FNAMES to a fasta file.

    FNAMES: list of fastq file names (optionally gzipped).
    EXT:    extension for the output files, without a leading ".".

    A failed seqkit run is reported on stderr; the remaining files are
    still processed.
    """

    def WriteFastaFile(FN, OutName):
        # Run seqkit fq2fa to convert FN, writing the result to OutName.
        COMSTR = ['seqkit', 'fq2fa', '-o', OutName, FN]
        status = subprocess.call(COMSTR)
        if status != 0:
            sys.stderr.write(PROGRAM + "seqkit fq2fa exited with status "
                             + str(status) + " for " + FN + "\n")

    def FastaName(FN, EXT):
        # Build the output name: strip an optional .gz, then replace
        # .fq/.fastq by EXT, and put the .gz back at the end.
        if FN.endswith(".gz"):
            Trunc1 = FN[:-3]
            GZ = ".gz"
        else:
            Trunc1 = FN
            GZ = ""
        if Trunc1.endswith(".fq"):
            Trunc2 = Trunc1[:-3]
        elif Trunc1.endswith(".fastq"):
            Trunc2 = Trunc1[:-6]
        else:
            Trunc2 = Trunc1
        return Trunc2 + "." + EXT + GZ

    # . . . . . . . . . . . . . .
    for F in FNAMES:
        OutName = FastaName(F, EXT)
        WriteFastaFile(F, OutName)


# ======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.
    """
    # Read parameters from command line
    P = Parameters()

    # Without a TSV file there is nothing to do; tell the user how to call us.
    if P.TSVFILE == "":
        sys.stderr.write(PROGRAM + "no TSV file specified." + USAGE + "\n")
        return

    # Read the names of the read files to be converted
    TF = TSVFiles()
    TF.ReadTSVfile(P.TSVFILE)

    # For each line of names, run seqkit fq2fa to generate fasta files.
    for FNAMES in TF.READPAIRS:
        CreateFastaFiles(FNAMES, P.EXT)


if __name__ == "__main__":
    main()