#!/usr/bin/env python3

# Synopsis: adaptercheck.py fastqlist adapterfile olen reportfile

# filelist - list of fastq/fasta files to search for adapter contamination
#          The easy way to create this file is something like
#          ls -1 *.fastq > fastqlist
# adapterfile - fasta file with 1 or more adapter sequences
# olen - for a given adapter, search for n nucleotides from the 3' end
# reportfile - write output to reportfile

# Given a list of oligonucleotides from 5' or 3' ends of 
# sequencing adapters, write the number of occurrences of 
# each to the output file. Do this for each file with
# the specified extension.

# If the fastq file has many thousands of instances, the
# adapter is probably contaminating the reads.
# If there are a very small number of hits, it can be
# attributed to random chance.

# This program calls the Unix grep command for fast searches

# To make results comparable across adapters, all oligos
# should be a uniform length eg. 10 nt.

#See Adapter trimming: Why are adapter sequences trimmed from only the 3' ends of reads?
# at https://support.illumina.com/bulletins/2016/04/adapter-trimming-why-are-adapter-sequences-trimmed-from-only-the--ends-of-reads.html

import sys
import os


NL="\n"
TAB="\t"
PID = str(os.getpid()) # process id to use for temporary files
TMPFN = "adapterchecktmp." + PID
NUCLEOTIDES = ['A','G','C','T','N']

#------------------------------------------------
class Adapter :
    """
    Name and sequence of adapter, from fasta file
    """
    def __init__(self):
        """
        Initializes arguments
        """
        self.Name = ""
        self.Seq = ""
        self.SeqLen = 0
        self.Oligo5 = ""
        self.Oligo3 = ""

#------------------------------------------------
class SeqData:

    def __init__(self):
        """
        Holds sequences and associated data
        """
        self.SeqLst = []
        self.NumSeq = 0


#------------------------------------------------
def ReadFasta(adapterfile, S, OLEN):
    """
    Read sequences from adapterfile in the form:

    >name
    sequence
    sequence
    sequence...
    
    """

    in_file = open(adapterfile, 'r')

    line = in_file.readline()
    S.NumSeq = 0
    while line != '':
        if line[0] == '>':
            tempseq = Adapter()
            tempseq.Name = line[1:].strip()
            line = in_file.readline()
            while line != '' and line[0] != '>':
                tempseq.Seq = tempseq.Seq + line.strip()
                line = in_file.readline()
            tempseq.SeqLen = len(tempseq.Seq)
            if tempseq.SeqLen > OLEN :
                tempseq.Oligo5 = tempseq.Seq[0:OLEN]
                tempseq.Oligo3 = tempseq.Seq[-OLEN:tempseq.SeqLen-1]
            S.SeqLst.append(tempseq)
            #S.SeqLen = len(S.SeqLst[S.NumSeq].Seq)
            S.NumSeq = S.NumSeq + 1

    in_file.close()

#------------------------------------------------
class FileList :
    """
    Name and sequence of adapter, from fasta file
    """
    def __init__(self):
        """
        Initializes arguments
        """
        self.NAMES = []

    def ReadFileList(self,FN) :
        f = open(FN,"r")
        for line in f.readlines() :
            self.NAMES.append(line.strip())
            
        f.close()
        return

#------------------------------------------------
def GetLine(FN) :
    """
    Read first line of a file
    """
    f = open(FN,"r")
    result = f.read().strip()
    f.close()
    return result

#------------------------------------------------
def EstReadLen(fqfile) :
    """
    Read the first 1000 reads and estimate the average read length.
    """
    f = open(fqfile,"r")
    
    line = f.readline()
    NumSeq = 0
    totallen = 0
    while (line != '') and (NumSeq <= 1000):
        if line[0] in NUCLEOTIDES :
            #print(line.strip())
            totallen = totallen + len(line.strip())            
            NumSeq = NumSeq + 1
        line = f.readline()
    f.close()
    #print("totallen: " + str(totallen) + " NumSeq: " + str(NumSeq))
    if NumSeq > 0 :
        result = totallen/NumSeq # Average read length
    else :
        result = 0
    return result

#------------------------------------------------
def GrepSeqCount(fqfile) :
    """
    Call grep | wc -l to count total number of sequences in a fastq file 
    """

    # Why do we do it this way when everyone on various web sites tells you to
    # use subprocess? Because ALL of the examples for complex Unix commands require
    # Shell=True, which is considered a security risk.
    #COMMAND = "grep " + "'@'" + " < " + fqfile + " | wc -l > " + TMPFN
    COMMAND = "wc -l < " + fqfile + " > " + TMPFN
    os.system(COMMAND)
    tmpresult = int(GetLine(TMPFN))  
    result = tmpresult/4 # Each sequence in a fastq file has 4 lines
    return result

#------------------------------------------------
def GrepCount(fqfile,oligo) :
    """
    Call grep | wc -l to count number of lines in a fastq file containing oligo
    """

    # Why do we do it this way when everyone on various web sites tells you to
    # use subprocess? Because ALL of the examples for complex Unix commands require
    # Shell=True, which is considered a security risk.
    COMMAND = "grep " + oligo + " < " + fqfile + " | wc -l > " + TMPFN
    os.system(COMMAND)
    result = int(GetLine(TMPFN))  
    return result

#======================= MAIN ================================

fastqlist = sys.argv[1]
adapterfile = sys.argv[2]
olen = int(sys.argv[3])
reportfile = sys.argv[4]


print("fastqlist: " + fastqlist)
print("adapterfile: " + adapterfile)
print("olen: " + str(olen))
print("reportfile: " + reportfile)

FILES = FileList()
FILES.ReadFileList(fastqlist)


rfile=open(reportfile,'w')
rfile.write("__________ adaptercheck.py __________" + NL)
rfile.write(NL)
rfile.write("Adapter file: " + TAB + adapterfile + NL)
rfile.write("5' oligo length: " + TAB + str(olen) + NL)
rfile.write(NL)

S = SeqData()
ReadFasta(adapterfile,S,olen)

for F in FILES.NAMES :
    totalseqs = GrepSeqCount(F)
    print("Total sequences: " + str(totalseqs))
    readlen = EstReadLen(F)
    print("Est. read len.: " +  str(readlen))
    Expect = (totalseqs * readlen) / (4**olen)
    rfile.write("------ " + F + " ------" + TAB + "total seqs: " + TAB + str(totalseqs) + NL)
    rfile.write(TAB + str(olen) + "-mer Expect: " + TAB + str(Expect) + NL)
    for adapter in S.SeqLst :
        N = GrepCount(F,adapter.Oligo5)
        rfile.write(adapter.Name + TAB + adapter.Oligo5 + TAB + str(N) + NL)
    rfile.write(NL)

rfile.close()
if os.path.exists(TMPFN) :
    os.remove(TMPFN)