#!/usr/bin/env python

# Reads a FASTA file with adapters (argument 1) and identifies which
# of those adapters are present/used in a FASTQ file (argument 2).
# The identified adapters are printed to standard output.
#
# Usage: adapterg.py  adapter_query.fa  fastq_data.fq  > results.fa
# 
# An example adapter query file is included in the current directory:
#    illuminaClipping.fa_original

import sys, os, os.path, re, gzip
import subprocess, mimetypes
from subprocess import Popen, PIPE
from Bio import SeqIO

def open_fastq(path):
	"""Open the FASTQ file at *path* for line-by-line reading.

	Files whose name indicates gzip compression (as reported by
	mimetypes.guess_type) are opened with gzip.open(); everything else
	is opened with the builtin open().
	"""
	if mimetypes.guess_type(path)[1] == 'gzip':
		return gzip.open(path)
	return open(path)


def find_adapters(lines, adapters, last_linenum=80000):
	"""Return the adapters that occur in *lines*, in order of discovery.

	lines: iterable of FASTQ lines, either text or bytes (gzip.open()
	yields bytes on Python 3).
	adapters: lower-case adapter sequences to look for.
	last_linenum: 0-based number of the last line that may be examined,
	so at most last_linenum + 1 lines are scanned.

	Each line is lower-cased and searched for every adapter that has not
	been seen yet. Scanning stops as soon as all adapters have been found
	or the line limit has been reached.
	"""
	remaining = list(adapters)
	found = []
	for linenum, line in enumerate(lines):
		if isinstance(line, bytes) and not isinstance(line, str):
			# Only true on Python 3, where bytes and str differ; the
			# adapters are text, so the line has to be text as well.
			line = line.decode('ascii', 'replace')
		text = line.lower()
		# Build a fresh list instead of removing entries from the list
		# being iterated: removing during iteration skips the element
		# that follows every hit.
		still_missing = []
		for seq in remaining:
			if seq in text:
				found.append(seq)
			else:
				still_missing.append(seq)
		remaining = still_missing
		if not remaining or linenum >= last_linenum:
			break
	return found


def main(argv):
	"""Print the adapters of argv[1] that occur in argv[2] as FASTA.

	argv follows the sys.argv layout: program name, adapter FASTA file,
	FASTQ file. Returns the process exit status: 0 on success, 2 when
	the FASTQ argument is missing.
	"""
	if len(argv) < 3:
		prog = argv[0] if argv else 'adapterg.py'
		sys.stderr.write(
			"Usage: %s adapter_query.fa fastq_data.fq > results.fa\n" % prog)
		return 2

	# Map each lower-cased adapter sequence to its record id, and keep
	# the sequences in file order for searching.
	names = dict()
	adapters = list()
	for record in SeqIO.parse(argv[1], 'fasta'):
		seq = str(record.seq).lower()
		names[seq] = str(record.id)
		adapters.append(seq)

	infile = open_fastq(argv[2])
	try:
		hitlist = find_adapters(infile, adapters)
	finally:
		# Release the file handle even if reading fails part-way.
		infile.close()

	# Single-argument print(...) produces the same output whether it is
	# parsed as a Python 2 statement or a Python 3 function call.
	for seq in hitlist:
		print("> " + names[seq])
		print(seq)
	return 0


# Running the script without any arguments stays a silent no-op.
if __name__ == '__main__' and len(sys.argv) > 1:
	sys.exit(main(sys.argv))