#!/usr/bin/env python

################################################
# SORTS THE SEQUENCES IN A FASTA FILE BY NAME  #
# THIS IS OPEN SOURCE CODE (YAY!)              #
################################################
# LICENSE
#    This code is licensed under the Creative Commons 3.0
#    Attribution + ShareAlike license - for details see:
#       http://creativecommons.org/licenses/by-sa/3.0/
#
# DATE: February 27th, 2014
#
# PROGRAM AUTHOR (PROGRAMMER)
#    Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# CO-AUTHORS/ACKNOWLEDGEMENTS
#    Justin Zhang          - for providing me information on the SAM format,
#                            and test data to build and debug this program.
#    Dr. Brian Fristensky  - my work supervisor, and the man who introduced
#                            me to the wonderful field of bioinformatics.
#
# QUESTIONS & COMMENTS
#    If you have any questions, please contact me: alvare@cc.umanitoba.ca
#    I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
#    P.S. Please also let me know of any bugs, or if you have any suggestions.
#         I am generally happy to help create new tools, or modify my existing
#         tools to make them more useful.
#
# Happy usage!

import sys, os, os.path, re, collections
from Bio import SeqIO

def collect_unique_sequences(records):
    """Map each record id to its sequence, keeping the first occurrence.

    records -- an iterable of objects exposing ``id`` and ``seq``
               attributes (e.g. what ``SeqIO.parse`` yields).

    A record whose id has already been seen is skipped, and a notice is
    printed to standard output.  Returns a dict of id -> sequence.
    """
    sequences = dict()
    for record in records:
        if record.id in sequences:
            print("    Skipping duplicate sequence " + record.id)
        else:
            sequences[record.id] = record.seq
    return sequences


def sorted_output_name(filename):
    """Return the output path: the input's extension replaced by '.sorted.fa'."""
    return os.path.splitext(filename)[0] + ".sorted.fa"


def write_sorted_fasta(sequences, outfile):
    """Write the id -> sequence mapping to outfile in FASTA format.

    Entries are ordered case-insensitively by id; ids that differ only in
    case are ordered by their exact spelling so the output is deterministic.
    Only the id is written to the header line, and each sequence is written
    on a single line.
    """
    for key in sorted(sequences, key=lambda k: (k.lower(), k)):
        # No space after '>': a space there makes many FASTA readers see
        # an empty identifier.
        outfile.write(">" + str(key) + "\n")
        outfile.write(str(sequences[key]) + "\n")


def sort_fasta_file(filename):
    """Sort one FASTA file by sequence name into '<basename>.sorted.fa'.

    Returns the name of the file that was written.
    """
    # Print the name of the FASTA file being processed
    print("File: " + filename)

    # Parse every sequence before opening the output file
    sequences = collect_unique_sequences(SeqIO.parse(filename, 'fasta'))

    outname = sorted_output_name(filename)
    # 'with' guarantees the output file is closed even if a write fails
    with open(outname, 'w') as outfile:
        write_sorted_fasta(sequences, outfile)
    return outname


if __name__ == "__main__":
    if len(sys.argv) > 1:
        for filename in sys.argv[1:]:
            sort_fasta_file(filename)
    else:
        print("Usage:  " + os.path.basename(sys.argv[0]) + " <fasta_files>")
        print("Output: for each FASTA file, a '.sorted.fa' file with its"
              " sequences sorted by name")