#!/usr/bin/env python ################################################ # REMOVE A LIST OF SEQUENCES IN ONE FASTA FILE # # FROM ANOTHER FASTA FILE - USED FOR DELETING # # RRNA FROM FASTA FILES OF CONTIGS/CHROMOSOMES # # THIS IS OPEN SOURCE CODE (YAY!) # ################################################ # LICENSE # This code is licensed under the Creative Commons 3.0 # Attribution + ShareAlike license - for details see: # http://creativecommons.org/licenses/by-sa/3.0/ # # DATE: February 27th, 2014 # # PROGRAM AUTHOR (PROGRAMMER) # Graham Alvare - home.cc.umanitoba.ca/~alvare # # CO-AUTHORS/ACKNOWLEDGEMENTS # Justin Zhang - for providing me information on the SAM format, # and test data to build and debug this program. # Dr. Brian Fristensky - my work supervisor, and the man who introduced # me to the wonderful field of bioinformatics. # # QUESTIONS & COMMENTS # If you have any questions, please contact me: alvare@cc.umanitoba.ca # I usually get back to people within 1-2 weekdays (weekends, I am slower) # # P.S. Please also let me know of any bugs, or if you have any suggestions. # I am generally happy to help create new tools, or modify my existing # tools to make them more useful. # # Happy usage! import sys, os, os.path, re, collections from Bio import SeqIO if __name__=="__main__": if len(sys.argv) > 2: # Get the filenames to open genomename = sys.argv[1] filtername = sys.argv[2] # Generate the filter list seqfilter = list() # Open output FASTA filter file for record in SeqIO.parse(filtername, 'fasta'): seqfilter.append(str(record.seq)) ################################ # Filter the Genome FASTA file # ################################ # Open the Genome output FASTA file outfile = open(genomename + "_filtered", 'w') # Open the Genome input FASTA file for record in SeqIO.parse(genomename, 'fasta'): # Read in the current Genome sequence gseq = str(record.seq) # Remove the occurences of each filter # sequence from the Genome sequence for fseq in seqfilter: gseq = gseq.replace(fseq, '') # Output the filtered Genome sequence # to the Genome results FASTA file print >> outfile, "> " + str(record.id) print >> outfile, gseq # Close the output file handle outfile.close()