#!/usr/bin/env python

################################################
# REMOVE DUPLICATE SEQUENCES FROM A FASTA FILE #
# DUPLICATION IS DETERMINED BY NUCLEOTIDE      #
# SEQUENCE IDENTITY, NOT BY SEQUENCE TAG/NAME! #
# THIS IS OPEN SOURCE CODE (YAY!)              #
################################################
# LICENSE
#    This code is licensed under the Creative Commons 3.0
#    Attribution + ShareAlike license - for details see:
#       http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
#    Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
#    Justin Zhang          - for providing me information on the SAM format,
#                            and test data to build and debug this program.
#    Dr. Brian Fristensky  - my work supervisor, and the man who introduced
#                            me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
#    If you have any questions, please contact me: alvare@cc.umanitoba.ca
#    I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
#    P.S. Please also let me know of any bugs, or if you have any suggestions.
#         I am generally happy to help create new tools, or modify my existing
#         tools to make them more useful.
#
# Happy usage!

import sys, os, os.path, re, collections
from Bio import SeqIO

def deduped_output_path(filename):
    """Return the output path for the de-duplicated copy of *filename*.

    The "deduped_" prefix is applied to the file's base name only, so an
    input given with a directory component (e.g. "data/in.fa") is written
    next to the input ("data/deduped_in.fa") instead of into a directory
    named "deduped_data", which usually does not exist.
    """
    directory, basename = os.path.split(filename)
    return os.path.join(directory, "deduped_" + basename)


def iter_unique_records(records):
    """Yield (record, sequence_string) for first occurrences only.

    *records* is any iterable of objects exposing a ``seq`` attribute.
    Two records are duplicates when ``str(record.seq)`` is exactly equal
    (the comparison is case-sensitive); record names are ignored.  The
    input order of the surviving records is preserved.
    """
    # A set gives O(1) membership tests; a list would make the whole
    # pass quadratic in the number of distinct sequences.
    seen = set()
    for record in records:
        seq = str(record.seq)
        if seq in seen:
            continue
        seen.add(seq)
        yield record, seq


def dedup_fasta_file(filename):
    """Write the unique sequences of FASTA file *filename* to a new file.

    Each kept record is written as a "> <id>" header line followed by the
    whole sequence on a single line.  Returns the path of the file written.
    """
    out_path = deduped_output_path(filename)
    # The context manager closes the output even if parsing raises.
    with open(out_path, "w") as outfile:
        # Open the FASTA file and parse each sequence
        for record, seq in iter_unique_records(SeqIO.parse(filename, 'fasta')):
            outfile.write("> " + str(record.id) + "\n")
            outfile.write(seq + "\n")
    return out_path


if __name__ == "__main__":
    if len(sys.argv) > 1:
        for filename in sys.argv[1:]:
            dedup_fasta_file(filename)
    else:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("Synopsis: removes duplicate nucleotide sequences from a FASTA file")
        print("Usage:    fsa_dedup.py <fasta_files>")