#!/usr/bin/env python3 # Synopsis: adaptercheck.py fastqlist adapterfile olen reportfile # filelist - list of fastq/fasta files to search for adapter contamination # The easy way to create this file is something like # ls -1 *.fastq > fastqlist # adapterfile - fasta file with 1 or more adapter sequences # olen - for a given adapter, search for n nucleotides from the 3' end # reportfile - write output to reportfile # Given a list of oligonucleotides from 5' or 3' ends of # sequencing adapters, write the number of occurrences of # each to the output file. Do this for each file with # the specified extension. # If the fastq file has many thousands of instances, the # adapter is probably contaminating the reads. # If there are a very small number of hits, it can be # attributed to random chance. # This program calls the Unix grep command for fast searches # To make results comparable across adapters, all oligos # should be a uniform length eg. 10 nt. #See Adapter trimming: Why are adapter sequences trimmed from only the 3' ends of reads? # at https://support.illumina.com/bulletins/2016/04/adapter-trimming-why-are-adapter-sequences-trimmed-from-only-the--ends-of-reads.html import sys import os NL="\n" TAB="\t" PID = str(os.getpid()) # process id to use for temporary files TMPFN = "adapterchecktmp." + PID NUCLEOTIDES = ['A','G','C','T','N'] #------------------------------------------------ class Adapter : """ Name and sequence of adapter, from fasta file """ def __init__(self): """ Initializes arguments """ self.Name = "" self.Seq = "" self.SeqLen = 0 self.Oligo5 = "" self.Oligo3 = "" #------------------------------------------------ class SeqData: def __init__(self): """ Holds sequences and associated data """ self.SeqLst = [] self.NumSeq = 0 #------------------------------------------------ def ReadFasta(adapterfile, S, OLEN): """ Read sequences from adapterfile in the form: >name sequence sequence sequence... """ in_file = open(adapterfile, 'r') line = in_file.readline() S.NumSeq = 0 while line != '': if line[0] == '>': tempseq = Adapter() tempseq.Name = line[1:].strip() line = in_file.readline() while line != '' and line[0] != '>': tempseq.Seq = tempseq.Seq + line.strip() line = in_file.readline() tempseq.SeqLen = len(tempseq.Seq) if tempseq.SeqLen > OLEN : tempseq.Oligo5 = tempseq.Seq[0:OLEN] tempseq.Oligo3 = tempseq.Seq[-OLEN:tempseq.SeqLen-1] S.SeqLst.append(tempseq) #S.SeqLen = len(S.SeqLst[S.NumSeq].Seq) S.NumSeq = S.NumSeq + 1 in_file.close() #------------------------------------------------ class FileList : """ Name and sequence of adapter, from fasta file """ def __init__(self): """ Initializes arguments """ self.NAMES = [] def ReadFileList(self,FN) : f = open(FN,"r") for line in f.readlines() : self.NAMES.append(line.strip()) f.close() return #------------------------------------------------ def GetLine(FN) : """ Read first line of a file """ f = open(FN,"r") result = f.read().strip() f.close() return result #------------------------------------------------ def EstReadLen(fqfile) : """ Read the first 1000 reads and estimate the average read length. """ f = open(fqfile,"r") line = f.readline() NumSeq = 0 totallen = 0 while (line != '') and (NumSeq <= 1000): if line[0] in NUCLEOTIDES : #print(line.strip()) totallen = totallen + len(line.strip()) NumSeq = NumSeq + 1 line = f.readline() f.close() #print("totallen: " + str(totallen) + " NumSeq: " + str(NumSeq)) if NumSeq > 0 : result = totallen/NumSeq # Average read length else : result = 0 return result #------------------------------------------------ def GrepSeqCount(fqfile) : """ Call grep | wc -l to count total number of sequences in a fastq file """ # Why do we do it this way when everyone on various web sites tells you to # use subprocess? Because ALL of the examples for complex Unix commands require # Shell=True, which is considered a security risk. #COMMAND = "grep " + "'@'" + " < " + fqfile + " | wc -l > " + TMPFN COMMAND = "wc -l < " + fqfile + " > " + TMPFN os.system(COMMAND) tmpresult = int(GetLine(TMPFN)) result = tmpresult/4 # Each sequence in a fastq file has 4 lines return result #------------------------------------------------ def GrepCount(fqfile,oligo) : """ Call grep | wc -l to count number of lines in a fastq file containing oligo """ # Why do we do it this way when everyone on various web sites tells you to # use subprocess? Because ALL of the examples for complex Unix commands require # Shell=True, which is considered a security risk. COMMAND = "grep " + oligo + " < " + fqfile + " | wc -l > " + TMPFN os.system(COMMAND) result = int(GetLine(TMPFN)) return result #======================= MAIN ================================ fastqlist = sys.argv[1] adapterfile = sys.argv[2] olen = int(sys.argv[3]) reportfile = sys.argv[4] print("fastqlist: " + fastqlist) print("adapterfile: " + adapterfile) print("olen: " + str(olen)) print("reportfile: " + reportfile) FILES = FileList() FILES.ReadFileList(fastqlist) rfile=open(reportfile,'w') rfile.write("__________ adaptercheck.py __________" + NL) rfile.write(NL) rfile.write("Adapter file: " + TAB + adapterfile + NL) rfile.write("5' oligo length: " + TAB + str(olen) + NL) rfile.write(NL) S = SeqData() ReadFasta(adapterfile,S,olen) for F in FILES.NAMES : totalseqs = GrepSeqCount(F) print("Total sequences: " + str(totalseqs)) readlen = EstReadLen(F) print("Est. read len.: " + str(readlen)) Expect = (totalseqs * readlen) / (4**olen) rfile.write("------ " + F + " ------" + TAB + "total seqs: " + TAB + str(totalseqs) + NL) rfile.write(TAB + str(olen) + "-mer Expect: " + TAB + str(Expect) + NL) for adapter in S.SeqLst : N = GrepCount(F,adapter.Oligo5) rfile.write(adapter.Name + TAB + adapter.Oligo5 + TAB + str(N) + NL) rfile.write(NL) rfile.close() if os.path.exists(TMPFN) : os.remove(TMPFN)