#!/usr/bin/env python3

'''
bl_seqkit_sample.py - given files with names of readfiles, create files containing a randomly chosen sample of reads

Synopsis: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer

Description: Runs seqkit sample to create files containing randomly chosensubsets of a large read set. The names or paired-end read files are given as two tab-separated fields on tsvfile, while only one field is given for files with single-end reads. To make sure that both left and right reads for the same fragment are written to the output, seqkit sample is run using the same random seed for both read files. 

@modified: April 5, 2021
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca  
'''

"""
optparse is deprecated in favor of argparse as of Python 2.7. However,
 since 2.7 is not always present on many systems, at this writing,
 it is safer to stick with optparse for now. It should be easy
 to change later, since the syntax is very similar between argparse and optparse.
 from optparse import OptionParser
"""
from optparse import OptionParser

import os
import random
import re
import subprocess
import sys


PROGRAM = "bl_seqkit_sample.py : "
USAGE = "\n\tUSAGE: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer --threads integer"

DEBUG = False #Must be False when run by BioLegato
if DEBUG :
    print('bl_seqkit_sample: Debugging mode on')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
      	Wrapper class for command line parameters
      	"""
    def __init__(self):
        """
     	  Initializes arguments:
                TSVFILE = ""
                EXT = ""
                PREFIX = ""
                PERCENT = ""

          """
        self.TSVFILE = ""
        self.EXT = ""
        self.PREFIX = ""
        self.PERCENT = 5  
        self.THREADS = 1       
        self.read_args()


        if DEBUG :
            print('------------ Parameters from command line ------')
            print('    TSVFILE: ' + self.TSVFILE)
            print('    EXT: ' + self.EXT)
            print('    PREFIX: ' + self.PREFIX)
            print('    PERCENT: ' + str(self.PERCENT))
            print('    THREADS: ' + str(self.THREADS))
            print('')  

    def read_args(self):
        """
        	Read command line arguments into a Parameter object
    	"""
            
        parser = OptionParser()
        parser.add_option("--tsv", dest="tsvfile", action="store", default="",
                          help="TAB-separated value file of file names")
        parser.add_option("--ext", dest="ext", action="store", default="",
                          help="file extension of input files")
        parser.add_option("--prefix", dest="prefix", action="store", default="",
                          help="prefix to add to file extension")
        parser.add_option("--percent", dest="percent", action="store", default=5,
                          help="percent of original file to include in the sample")
        parser.add_option("--threads", "-j", dest="threads", action="store", default=1,
                          help="# CPUs to use")
       
        (options, args) = parser.parse_args() 
        self.TSVFILE = options.tsvfile
        self.EXT = options.ext
        self.PREFIX = options.prefix
        self.PERCENT = int(options.percent)
        self.THREADS = int(options.threads)

class TSVFiles :
    """
    Methods for reading lists of paired read TSV files, and for
    writing lists to output.
    """
    def __init__(self):
        """
     	  Initializes arguments:
                READPAIRS = []

          """
        self.READPAIRS = []            

    def ReadTSVfile(self,FN) :
        """
        TSV file containing names of paired-end and/or single end read files.
        Paired-end files are on lines such as

        leftreadfile.fq.gz<TAB>rightreadfile.fq.gz

        Single-end files have a each file on a separate line

        reads1.fq.gz
        reads2.fq.gz
        reads3.fq.gz
        """
        TAB = '\t'
        F = open(FN,"r")
        for line in F.readlines() :
            line = line.strip()
            if len(line) > 0 and not line.startswith('#') :
                # get rid of double quotes that enclose fields when some programs write
                # output, and then split by TABs.
                tokens = line.replace('"','').split(TAB)

                # ignore blank fields. Add either single or pair of filenames
                # to list. Only process names from first two fields on a line
                # and ignore other fields. 
                if len(tokens) > 0 :
                    r1 = tokens[0].strip()
                    if len(r1) > 0 :
                        fnames = [r1]
                    else :
                        fnames = []
                    if len(tokens) > 1 :
                        r2 = tokens[1].strip()
                        if len(r2) > 0 :
                            fnames.append(r2)
                    if len(fnames) > 0 :
                        self.READPAIRS.append(fnames)
             
        if DEBUG :
            print(str(self.READPAIRS))
        F.close()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def CreateSampleFiles(FNAMES,EXT,PREFIX,PERCENT,THREADS) :

    def WriteSampleFile(FN,OutName) :
        print("        running WriteSampleFile")
        # Run seqkit sample to generate the sample file
        COMSTR = ['seqkit','sample','--proportion',Proportion,'--rand-seed', Seed, '--threads', str(THREADS), '-o', OutName, FN]
        print(str(COMSTR))
        p = subprocess.Popen(COMSTR)
        p.wait()

    # . . . . . . . . . . . . . . 
   
    print("    running CreateSampleFiles")
    # Generate parameters
    Proportion = str(PERCENT/100.0)
    Seed = str(random.randint(-32767,32767))

    # single-end file
    if len(FNAMES) == 1 :
        FN1 = FNAMES[0]
        if FN1.endswith(EXT) :
            # Parse the filename into basename and extension
            BaseName1 = FN1[0:FN1.find(EXT)]
            # Create an output filename consisting of <BASENAME><PREFIX><EXTENSION>
            OutName1 = BaseName1 + PREFIX + EXT 
            # Run seqkit
            WriteSampleFile(FN1,OutName1)      
    # paired-end files
    else :
        FN1 = FNAMES[0]
        FN2 = FNAMES[1]
        if ( FN1.endswith(EXT) and FN2.endswith(EXT) ) :
            # Parse the filename into basename and extension
            BaseName1 = FN1[0:FN1.find(EXT)]
            BaseName2 = FN2[0:FN2.find(EXT)]
            # Create an output filename consisting of <BASENAME><PREFIX><EXTENSION>
            OutName1 = BaseName1 + PREFIX + EXT 
            OutName2 = BaseName2 + PREFIX + EXT
            # Run seqkit
            WriteSampleFile(FN1,OutName1)   
            WriteSampleFile(FN2,OutName2) 


#======================== MAIN PROCEDURE ==========================
def main():
    """
        Called when not in documentation mode.
        """
    print("running bl_seqkit_sample.py")
    # Read parameters from command line
    P = Parameters()

    # Read paired-end files used for sequence assembly
    TF = TSVFiles()
    if not P.TSVFILE == "" :
        TF.ReadTSVfile(P.TSVFILE)

    # For each pair of names, run seqkit sample to generate sample files. 
    for FNAMES in TF.READPAIRS :
        print("FNAMES: " + str(FNAMES))
        CreateSampleFiles(FNAMES,P.EXT,P.PREFIX,P.PERCENT,P.THREADS)  

if __name__ == "__main__":
    main()