#!/usr/bin/env python
# Reads a FASTA file with adapters (argument 1) and identifies which
# of those adapters are present/used in a FASTQ file (argument 2).
# The identified adapters are printed to standard output.
#
# Usage: adapterg.py adapter_query.fa fastq_data.fq > results.fa
#
# An example adapter query file is included in the current directory:
# illuminaClipping.fa_original

import sys, os, os.path, re, gzip
import subprocess, mimetypes
from subprocess import Popen, PIPE

try:
    from Bio import SeqIO
except ImportError:
    # Reported with a clear message by load_adapters() instead of failing
    # with a bare traceback at import time.
    SeqIO = None

# Scanning stops after the line with this zero-based index has been
# examined, so at most MAX_LINE_INDEX + 1 lines of the FASTQ file are read.
MAX_LINE_INDEX = 80000

USAGE = "Usage: adapterg.py adapter_query.fa fastq_data.fq > results.fa\n"


def _open_fastq(path):
    """Open *path* for reading; files guessed to be gzip go through gzip."""
    if mimetypes.guess_type(path)[1] == 'gzip':
        return gzip.open(path)
    return open(path)


def _as_text(line):
    """Return *line* as ``str``.

    ``gzip.open`` yields bytes on Python 3, which cannot be searched with a
    ``str`` needle; on Python 2 every line is already ``str``.
    """
    if isinstance(line, str):
        return line
    return line.decode('ascii', 'replace')


def load_adapters(path):
    """Read the adapter FASTA file at *path*.

    Returns a ``(names, wanted)`` tuple: ``names`` maps each lower-cased
    sequence to its record id (the last record wins when a sequence is
    repeated) and ``wanted`` lists the distinct lower-cased sequences in
    file order.

    Raises ImportError when Biopython is not installed.
    """
    if SeqIO is None:
        raise ImportError("Biopython (Bio.SeqIO) is required to read " + path)
    names = dict()
    wanted = list()
    for record in SeqIO.parse(path, 'fasta'):
        seq = str(record.seq).lower()
        # Queue each distinct sequence once so it cannot be reported twice.
        if seq not in names:
            wanted.append(seq)
        names[seq] = str(record.id)
    return names, wanted


def find_adapters(lines, wanted, max_line_index=MAX_LINE_INDEX):
    """Return the sequences from *wanted* that occur in *lines*.

    *lines* is any iterable of text or bytes lines; matching is
    case-insensitive against the (lower-case) sequences in *wanted*.
    Hits are returned in order of discovery.  Iteration stops as soon as
    every sequence has been found or the line with zero-based index
    *max_line_index* has been examined.  *wanted* is not modified.
    """
    remaining = list(wanted)
    hits = []
    for linenum, raw in enumerate(lines):
        line = _as_text(raw).lower()  # lower-case once per line
        # Build a fresh list rather than removing from the list being
        # iterated, which would skip the element after every hit.
        still_missing = []
        for seq in remaining:
            if seq in line:
                hits.append(seq)
            else:
                still_missing.append(seq)
        remaining = still_missing
        if not remaining or linenum >= max_line_index:
            break
    return hits


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    *argv* defaults to ``sys.argv``.  ``argv[1]`` is the adapter FASTA file
    and ``argv[2]`` the FASTQ file (optionally gzip-compressed).
    """
    if argv is None:
        argv = sys.argv
    # Both positional arguments are required.
    if len(argv) < 3:
        sys.stderr.write(USAGE)
        return 1

    names, wanted = load_adapters(argv[1])

    infile = _open_fastq(argv[2])
    try:
        hits = find_adapters(infile, wanted)
    finally:
        infile.close()

    # Single-argument print() calls behave the same on Python 2 and 3.
    for seq in hits:
        print("> " + names[seq])
        print(seq)
    return 0


if __name__ == '__main__':
    sys.exit(main())