############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

# based on IlluminaNxSeqJunction-Split7.py, IlluminaChimera-Clean4.py and ParseFastQ.py
# all copyrights are below!

# IlluminaNxSeqJunction-Split7.py by Scott Monsma, Copyright (c) Lucigen Corp July 2014 -
# based on NxSeqFOS-SplitBfa4.py
# Splits 'mates_ICC4_' files into left and right insert sequences by finding the Junction Code(s)
# usage: copy IlluminaNxSeqJunction-Split7.py and ParseFastQ.py into a directory with your fastq files to process
# cd into the directory with the .py and .fastq files
# make sure your read 1 filename contains '_R1_' and your read 2 filename contains '_R2_'
# at the command prompt type 'python IlluminaNxSeqJunction-Split7.py mates_ICC4_your-R1-filename.fastq' and hit enter
# split sequences are saved if they are longer than minseq
# output files are named 'R1_IJS7_mates_ICC4_your-R1-filename.fastq' and 'R2_IJS7_mates_ICC4_your-R2-filename.fastq',
# which are the trimmed mate pairs, and 'unsplit_IJS7_yourfilename.fastq', which contains interleaved reads
# where no junction was found.

# IlluminaChimera-Clean4.py by Scott Monsma, Lucigen Corp Copyright (C) July 2014
# usage: copy IlluminaChimera-Clean4.py and ParseFastQ.py into a directory with your fastq file to process
# cd into the directory with the .py and .fastq files
# at the command prompt type 'python IlluminaChimera-Clean4.py yourfilename.fastq' and hit enter
# four new files will be created: 'mates_ICC4_your-R1-filename.fastq' and 'mates_ICC4_your-R2-filename.fastq',
# containing the true mate pairs with matching chimera codes, and 'non-mates_ICC4_your-R1-filename.fastq' and
# 'non-mates_ICC4_your-R2-filename.fastq', containing the chimera read pairs and unidentified read pairs

import os
import time
import support
import gzip
import itertools
import sys
from site import addsitedir
import spades_init
import options_storage

try:
    import regex
except ImportError:
    support.error("Can't process Lucigen NxMate reads! Python module regex is not installed!")

addsitedir(spades_init.ext_python_modules_home)
if sys.version.startswith('2.'):
    from joblib2 import Parallel, delayed
elif sys.version.startswith('3.'):
    from joblib3 import Parallel, delayed

# CONSTANTS
READS_PER_THREAD = 25000
READS_PER_BATCH = READS_PER_THREAD * options_storage.threads  # e.g. 100000 for 4 threads
minseq = 25  # minimum length of a sequence to keep after trimming


class ParseFastQ(object):
    """Returns a read-by-read fastQ parser analogous to file.readline()"""

    def __init__(self, filePath, headerSymbols=['@', '+']):
        """Returns a read-by-read fastQ parser analogous to file.readline().
        Example: parser.next()
        -OR-
        It is an iterator, so you can do: for rec in parser: ... do something with rec ...
        rec is a tuple: (seqHeader, seqStr, qualHeader, qualStr)
        """
        if filePath.endswith('.gz'):
            self._file = gzip.open(filePath)
        else:
            self._file = open(filePath, 'rU')
        self._currentLineNumber = 0
        self._hdSyms = headerSymbols

    def __iter__(self):
        return self

    def next(self):  # for both Python2 and Python3
        return self.__next__()

    def __next__(self):
        """Reads in the next element, parses it, and does minimal verification.
        Returns: tuple: (seqHeader, seqStr, qualHeader, qualStr)"""
        # ++++ Get Next Four Lines ++++
        elemList = []
        for i in range(4):
            line = self._file.readline()
            self._currentLineNumber += 1  # increment file position
            if line:
                elemList.append(line.strip('\n'))
            else:
                elemList.append(None)

        # ++++ Check Lines For Expected Form ++++
        trues = [bool(x) for x in elemList].count(True)
        nones = elemList.count(None)
        # -- Check for acceptable end of file --
        if nones == 4:
            raise StopIteration
        # -- Make sure we got 4 full lines of data --
        assert trues == 4, \
            "** ERROR: It looks like I encountered a premature EOF or empty line.\n\
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (
                self._currentLineNumber)
        # -- Make sure we are in the correct "register" --
        assert elemList[0].startswith(self._hdSyms[0]), \
            "** ERROR: The 1st line in fastq element does not start with '%s'.\n\
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (
                self._hdSyms[0], self._currentLineNumber)
        assert elemList[2].startswith(self._hdSyms[1]), \
            "** ERROR: The 3rd line in fastq element does not start with '%s'.\n\
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (
                self._hdSyms[1], self._currentLineNumber)
        # -- Make sure the seq line and qual line have equal lengths --
        assert len(elemList[1]) == len(elemList[3]), \
            "** ERROR: The length of Sequence data and Quality data of the last record aren't equal.\n\
            Please check FastQ file near line number %s (plus or minus ~4 lines) and try again**" % (
                self._currentLineNumber)

        # ++++ Return fastQ data as tuple ++++
        return tuple(elemList)

    def close(self):
        if self._file:
            self._file.close()
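
# --- Illustrative example (not part of the original module) -----------------
# A minimal sketch of how ParseFastQ is consumed; 'fastq_fpath' may be any
# plain or gzipped FASTQ file. Defined for documentation purposes only and
# never called by the pipeline below.
def _example_count_fastq_records(fastq_fpath):
    """Count the records in a FASTQ file using ParseFastQ (illustration only)."""
    parser = ParseFastQ(fastq_fpath)
    n_records = 0
    for seqHeader, seqStr, qualHeader, qualStr in parser:
        n_records += 1  # each iteration yields one complete 4-line record
    parser.close()
    return n_records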

def write_to_files(file_handlers, record_lists):
    for file_handler, record_list in zip(file_handlers, record_lists):
        for record in record_list:
            for line in record:
                file_handler.write(line + '\n')


def split_into_chunks(l, n):
    avg = len(l) / float(n)
    out = []
    last = 0.0
    while last < len(l):
        out.append(l[int(last):int(last + avg)])
        last += avg
    return out


class CleanStats(object):
    def __init__(self):
        self.readcounter = 0
        self.matecounter = 0  # for pairs where both trimmed reads are equal to or longer than minseq
        self.TOTALmatecounter = 0
        self.slagcounter = 0
        self.csscounter = [0] * 12  # 12 slots for the correct code combinations

    def __add__(self, other):
        self.readcounter += other.readcounter
        self.matecounter += other.matecounter
        self.TOTALmatecounter += other.TOTALmatecounter
        self.slagcounter += other.slagcounter
        self.csscounter = [x + y for x, y in zip(self.csscounter, other.csscounter)]
        return self


def chimera_clean_process_batch(reads, csslist1, csslist2):
    stats = CleanStats()
    processed_out1 = []
    processed_out2 = []
    processed_slag1 = []
    processed_slag2 = []
    # rec is a tuple: (seqHeader, seqStr, qualHeader, qualStr)
    for recR1, recR2 in reads:
        stats.readcounter += 1
        # check if rec.seqStr contains a match to a chimera pattern
        for cssindex, css1 in enumerate(csslist1):
            m = regex.search(css1, recR1[1])
            css2 = csslist2[cssindex]
            n = regex.search(css2, recR2[1])

            if m and n:  # a true mate pair! write out to mates files
                stats.TOTALmatecounter += 1
                # NOTE: TAKE THIS OPPORTUNITY TO RECORD CSS CODE AND TRUNCATE READS
                # need to trim an additional 9+4 nts from the end of the match to remove
                # css, Bst, barcode (9) and CGAT (4) linker
                stats.csscounter[cssindex] += 1  # increment the appropriate css counter
                R1matches = m.span()
                mend = R1matches[1]
                mend = mend + 13
                mySeq = recR1[1]
                myR1 = mySeq[mend:]  # trim the left end off of Read1
                myQual1 = recR1[3]
                myR1Qual = myQual1[mend:]  # trim the left end off of Read1 quality string

                R2matches = n.span()
                nend = R2matches[1]
                nend = nend + 13
                mySeq2 = recR2[1]
                myR2 = mySeq2[nend:]  # trim the left end off of Read2
                myQual2 = recR2[3]
                myR2Qual = myQual2[nend:]  # trim the left end off of Read2 quality string

                if (len(myR1) >= minseq) and (len(myR2) >= minseq):  # if one or the other is too short, toss both
                    stats.matecounter += 1
                    processed_out1.append([recR1[0], myR1, recR1[2], myR1Qual])
                    processed_out2.append([recR2[0], myR2, recR2[2], myR2Qual])
                break  # found it, go on to next rec
            else:
                # this code pair did not match both reads, so it can't be a mate pair;
                # write out to slag files once the last code has been tried
                if cssindex == 11:
                    stats.slagcounter += 1
                    processed_slag1.append([recR1[0], recR1[1], recR1[2], recR1[3]])
                    processed_slag2.append([recR2[0], recR2[1], recR2[2], recR2[3]])
    return [processed_out1, processed_out2, processed_slag1, processed_slag2], stats
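
# --- Illustrative example (not part of the original module) -----------------
# The batching pattern used below by chimera_clean and nx_seq_junction, in
# miniature: split a batch into one chunk per worker, hand the chunks to
# joblib's Parallel, and collect the per-chunk results. '_example_double_chunk'
# is a stand-in for the real batch processors; neither helper is called by the
# pipeline. Note that split_into_chunks spreads items as evenly as possible,
# e.g. split_into_chunks([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4, 5]].
def _example_double_chunk(chunk):
    return [x * 2 for x in chunk]


def _example_parallel_batches(items, n_jobs=1):
    chunks = split_into_chunks(items, n_jobs)
    return Parallel(n_jobs=n_jobs)(delayed(_example_double_chunk)(c) for c in chunks)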

def chimera_clean(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]

    # open four outfiles
    outfilename1 = os.path.join(dst, 'mates_ICC4_' + basename1)
    outfile1 = open(outfilename1, 'w')
    slagfilename1 = os.path.join(dst, 'non-mates_ICC4_' + basename1)
    slagfile1 = open(slagfilename1, 'w')
    outfilename2 = os.path.join(dst, 'mates_ICC4_' + basename2)
    outfile2 = open(outfilename2, 'w')
    slagfilename2 = os.path.join(dst, 'non-mates_ICC4_' + basename2)
    slagfile2 = open(slagfilename2, 'w')

    # set up regular expression patterns for chimera codes;
    # for Illumina, use the reverse complements of the right codes
    csslist1 = ['(TGGACTCCACTGTG){e<=1}', '(ACTTCGCCACTGTG){e<=1}', '(TGAGTCCCACTGTG){e<=1}',
                '(TGACTGCCACTGTG){e<=1}', '(TCAGGTCCACTGTG){e<=1}', '(ATGTCACCACTGTG){e<=1}',
                '(GTATGACCACTGTG){e<=1}', '(GTCTACCCACTGTG){e<=1}', '(GTTGGACCACTGTG){e<=1}',
                '(CGATTCCCACTGTG){e<=1}', '(GGTTACCCACTGTG){e<=1}', '(TCACCTCCACTGTG){e<=1}']

    csslist2 = ['(TCCAGACCAATGTG){e<=1}', '(ACATCACCAATGTG){e<=1}', '(TCACGACCAATGTG){e<=1}',
                '(TAGCACCCAATGTG){e<=1}', '(AACCTCCCAATGTG){e<=1}', '(ACAACTCCAATGTG){e<=1}',
                '(GTCTAACCAATGTG){e<=1}', '(TACACGCCAATGTG){e<=1}', '(GAGAACCCAATGTG){e<=1}',
                '(GAGATTCCAATGTG){e<=1}', '(GACCTACCAATGTG){e<=1}', '(AGACTCCCAATGTG){e<=1}']

    # PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = CleanStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error("lucigen_nxmate.py, chimera_clean: "
                          "number of left reads (%d) is not equal to number of right reads (%d)!"
                          % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(delayed(chimera_clean_process_batch)(reads, csslist1, csslist2)
                                          for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([outfile1, outfile2, slagfile1, slagfile2], result)
            all_stats += stat
        if not silent:
            log.info("==== chimera_clean progress: reads processed: %d, time elapsed: %s"
                     % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))))

    parserR1.close()
    parserR2.close()

    outfile1.close()
    slagfile1.close()
    outfile2.close()
    slagfile2.close()

    if all_stats.TOTALmatecounter + all_stats.slagcounter != all_stats.readcounter:
        support.error("lucigen_nxmate.py, chimera_clean: error in the script somewhere! Unequal read counts!", log)
    if all_stats.readcounter == 0:
        support.error("lucigen_nxmate.py, chimera_clean: error in input data! Number of processed reads is 0!", log)
    if not silent:
        # print some stats
        percentmates = 100. * all_stats.matecounter / all_stats.readcounter
        percentslag = 100. * all_stats.slagcounter / all_stats.readcounter
        log.info("==== chimera_clean info: processing finished!")
        log.info("==== chimera_clean info: %d reads processed, %d true mate reads (%.2f %%) "
                 "and %d non-mates/chimeras (%.2f %%)."
                 % (all_stats.readcounter, all_stats.matecounter, percentmates,
                    all_stats.slagcounter, percentslag))
        shortmates = all_stats.TOTALmatecounter - all_stats.matecounter
        log.info("==== chimera_clean info: %d mates too short to keep after trimming" % shortmates)
        elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))
        log.info("==== chimera_clean info: time elapsed: %s" % (elapsedtime))
        log.info("==== chimera_clean info: " + str(all_stats.csscounter))
    return outfilename1, outfilename2
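
# --- Illustrative example (not part of the original module) -----------------
# A hypothetical driver for chimera_clean; the file names and output directory
# are placeholders, and 'log' is any logging.Logger-like object accepted by
# log.info/support.error. The returned paths point at the two 'mates_ICC4_'
# files, which nx_seq_junction below takes as input.
def _example_run_chimera_clean(log):
    return chimera_clean('sample_R1_.fastq', 'sample_R2_.fastq',
                         '/tmp/nxmate_out', log, silent=False)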

class JunctionStats(object):
    def __init__(self):
        self.readcounter = 0
        self.jctcounter = 0
        self.splitcounter = 0
        self.bothjctcounter = 0
        self.R1jctcounter = 0
        self.R2jctcounter = 0
        self.R1R2jctcounter = 0

    def __add__(self, other):
        self.readcounter += other.readcounter
        self.jctcounter += other.jctcounter
        self.splitcounter += other.splitcounter
        self.bothjctcounter += other.bothjctcounter
        self.R1jctcounter += other.R1jctcounter
        self.R2jctcounter += other.R2jctcounter
        self.R1R2jctcounter += other.R1R2jctcounter
        return self


def nx_seq_junction_process_batch(reads, jctstr):
    stats = JunctionStats()
    processed_split1 = []
    processed_split2 = []
    processed_unsplit = []
    for recR1, recR2 in reads:
        stats.readcounter += 1
        m = regex.search(jctstr, recR1[1])
        n = regex.search(jctstr, recR2[1])

        if m and n:  # found jctstr in both reads; need to save left part of R1 and left part of R2
            stats.bothjctcounter += 1
            matches = m.span()
            start = matches[0]
            mySeq = recR1[1]
            myLeft = mySeq[:start]  # get the left part of Read1
            myQual = recR1[3]
            myLeftQual = myQual[:start]  # get the left part of Read1 quality string

            nmatches = n.span()
            nstart = nmatches[0]
            mySeq2 = recR2[1]
            myRight2 = mySeq2[:nstart]  # get the left part of Read2
            myQual2 = recR2[3]
            myRightQual2 = myQual2[:nstart]  # get the left part of Read2 quality string

            # only write out as split if both pieces are big enough
            if (len(myLeft) > minseq) and (len(myRight2) > minseq):
                stats.splitcounter += 1
                stats.R1R2jctcounter += 1
                processed_split1.append([recR1[0], myLeft, recR1[2], myLeftQual])
                processed_split2.append([recR2[0], myRight2, recR2[2], myRightQual2])
        elif n:  # junction only in R2, so save entire R1 and left part of R2, IFF R2 is long enough
            nmatches = n.span()
            nstart = nmatches[0]
            mySeq2 = recR2[1]
            myRight2 = mySeq2[:nstart]
            myQual2 = recR2[3]
            myRightQual2 = myQual2[:nstart]
            if len(myRight2) > minseq:
                stats.splitcounter += 1
                processed_split2.append([recR2[0], myRight2, recR2[2], myRightQual2])
                processed_split1.append([recR1[0], recR1[1], recR1[2], recR1[3]])
                stats.jctcounter += 1
                stats.R2jctcounter += 1
        elif m:  # junction only in R1, so save left part of R1 and entire R2, IFF R1 is long enough
            matches = m.span()
            start = matches[0]
            mySeq = recR1[1]
            myLeft = mySeq[:start]
            myQual = recR1[3]
            myLeftQual = myQual[:start]
            if len(myLeft) > minseq:
                stats.splitcounter += 1
                processed_split1.append([recR1[0], myLeft, recR1[2], myLeftQual])
                processed_split2.append([recR2[0], recR2[1], recR2[2], recR2[3]])
                stats.jctcounter += 1
                stats.R1jctcounter += 1
        else:
            # no junctions; save as is for frag use as 'unsplit'.
            # note this file will be interleaved: R1 R2 R1 R2 ...
            processed_unsplit.append([recR1[0], recR1[1], recR1[2], recR1[3]])
            processed_unsplit.append([recR2[0], recR2[1], recR2[2], recR2[3]])
    return [processed_split1, processed_split2, processed_unsplit], stats
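
# --- Illustrative example (not part of the original module) -----------------
# A minimal demonstration of the fuzzy matching used above: the regex module's
# {e<=N} suffix allows up to N errors (substitutions, insertions or deletions),
# so a junction carrying a sequencing error still matches. The read sequence
# here is made up for illustration; the helper is never called by the pipeline.
def _example_fuzzy_junction_match():
    jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}'
    # junction with one substitution (first C -> T), embedded in flanking bases
    read = 'ACGT' + 'GTTTATCGTCAGGCCTGACGATGAAC' + 'ACGT'
    m = regex.search(jctstr, read)
    return m.span() if m else None  # expected: a span covering the junction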

def nx_seq_junction(infilename1, infilename2, dst, log, silent=True):
    starttime = time.time()

    basename1 = os.path.basename(infilename1)
    if os.path.splitext(basename1)[1] == '.gz':
        basename1 = os.path.splitext(basename1)[0]
    basename2 = os.path.basename(infilename2)
    if os.path.splitext(basename2)[1] == '.gz':
        basename2 = os.path.splitext(basename2)[0]

    # open three outfiles
    splitfilenameleft = os.path.join(dst, 'R1_IJS7_' + basename1)
    splitfile1 = open(splitfilenameleft, 'w')
    splitfilenameright = os.path.join(dst, 'R2_IJS7_' + basename2)
    splitfile2 = open(splitfilenameright, 'w')
    unsplitfilename = os.path.join(dst, 'unsplit_IJS7_' + basename1.replace('_R1_', '_R1R2_'))
    unsplitfile = open(unsplitfilename, 'w')

    # jctstr = '(GGTTCATCGTCAGGCCTGACGATGAACC){e<=4}'  # JS7 24/28 required; results in ~92% detected in Ion Torrent
    # from NextClip: --adaptor_sequence GTTCATCGTCAGG -e --strict_match 22,11 --relaxed_match 20,10
    # e.g. strict 22/26 = 4 errors, relaxed 20/26 = 6 errors
    jctstr = '(GTTCATCGTCAGGCCTGACGATGAAC){e<=4}'  # try 22/26 to match NextClip strict (e<=6 for relaxed)

    # PARSE both files in tuples of 4 lines
    parserR1 = ParseFastQ(infilename1)
    parserR2 = ParseFastQ(infilename2)

    all_stats = JunctionStats()
    n_jobs = options_storage.threads
    while True:
        # prepare input
        reads1 = list(itertools.islice(parserR1, READS_PER_BATCH))
        reads2 = list(itertools.islice(parserR2, READS_PER_BATCH))
        if len(reads1) != len(reads2):
            support.error("lucigen_nxmate.py, nx_seq_junction: "
                          "number of left reads (%d) is not equal to number of right reads (%d)!"
                          % (len(reads1), len(reads2)), log)
        if not reads1:
            break
        chunks = split_into_chunks(list(zip(reads1, reads2)), n_jobs)
        # processing
        outputs = Parallel(n_jobs=n_jobs)(delayed(nx_seq_junction_process_batch)(reads, jctstr)
                                          for reads in chunks)
        results, stats = [x[0] for x in outputs], [x[1] for x in outputs]
        # writing results
        for result, stat in zip(results, stats):
            write_to_files([splitfile1, splitfile2, unsplitfile], result)
            all_stats += stat
        if not silent:
            log.info("==== nx_seq_junction progress: reads processed: %d, time elapsed: %s"
                     % (all_stats.readcounter, time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))))

    parserR1.close()
    parserR2.close()

    splitfile1.close()
    splitfile2.close()
    unsplitfile.close()

    if all_stats.readcounter == 0:
        support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! Number of processed reads is 0!", log)
    if all_stats.splitcounter == 0:
        support.error("lucigen_nxmate.py, nx_seq_junction: error in input data! Number of split pairs is 0!", log)
    if not silent:
        # print some stats (float arithmetic so percentages are not truncated under Python 2)
        percentsplit = 100. * all_stats.splitcounter / all_stats.readcounter
        percentR1R2 = 100. * all_stats.R1R2jctcounter / all_stats.splitcounter
        percentR1 = 100. * all_stats.R1jctcounter / all_stats.splitcounter
        percentR2 = 100. * all_stats.R2jctcounter / all_stats.splitcounter
        log.info("==== nx_seq_junction info: processing finished!")
        log.info("==== nx_seq_junction info: %d reads processed" % (all_stats.readcounter))
        log.info("==== nx_seq_junction info: %d total split pairs (%.2f %% of processed reads)"
                 % (all_stats.splitcounter, percentsplit))
        log.info("==== nx_seq_junction info: %d junctions in both R1 and R2 (%.2f %% of split junctions)"
                 % (all_stats.R1R2jctcounter, percentR1R2))
        log.info("==== nx_seq_junction info: %d split junctions are in Read1 (%.2f %% of split junctions)"
                 % (all_stats.R1jctcounter, percentR1))
        log.info("==== nx_seq_junction info: %d split junctions are in Read2 (%.2f %% of split junctions)"
                 % (all_stats.R2jctcounter, percentR2))
        elapsedtime = time.strftime('%H:%M:%S', time.gmtime(time.time() - starttime))
        log.info("==== nx_seq_junction info: time elapsed: %s" % (elapsedtime))
    return splitfilenameleft, splitfilenameright, unsplitfilename
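
# --- Illustrative example (not part of the original module) -----------------
# The 'unsplit_IJS7_' file written above is interleaved (R1, R2, R1, R2, ...).
# A minimal sketch of reading it back into pairs; not used by the pipeline.
def _example_read_interleaved_pairs(unsplit_fpath):
    """Yield (recR1, recR2) tuples from an interleaved FASTQ file (illustration only)."""
    parser = ParseFastQ(unsplit_fpath)
    while True:
        rec1 = next(parser, None)  # None once the file is exhausted
        rec2 = next(parser, None)
        if rec1 is None or rec2 is None:
            break
        yield rec1, rec2
    parser.close()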

def process_reads(left_reads_fpath, right_reads_fpath, dst, log):
    log.info("== Processing Lucigen NxMate reads (" + left_reads_fpath + " and "
             + right_reads_fpath + "); results are in the " + dst + " directory")
    cleaned_filename1, cleaned_filename2 = chimera_clean(left_reads_fpath, right_reads_fpath,
                                                         dst, log, silent=False)
    split_filename1, split_filename2, unsplit_filename = nx_seq_junction(cleaned_filename1, cleaned_filename2,
                                                                         dst, log, silent=False)
    return split_filename1, split_filename2, unsplit_filename
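
# --- Illustrative example (not part of the original module) -----------------
# process_reads is the entry point used by the SPAdes pipeline. A hypothetical
# end-to-end invocation (paths and logger are placeholders):
#
#     import logging
#     log = logging.getLogger('spades')
#     split1, split2, unsplit = process_reads('sample_R1_.fastq',
#                                             'sample_R2_.fastq',
#                                             '/tmp/nxmate_out', log)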